diff --git a/epochX/cudacpp/.gitignore b/epochX/cudacpp/.gitignore index a25c916dce..0dda40b7a4 100644 --- a/epochX/cudacpp/.gitignore +++ b/epochX/cudacpp/.gitignore @@ -6,3 +6,5 @@ run_[0-9]* events.lhe* py3_model.pkl + +perf.data* diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc index d2b24bba27..7f4d65438d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -88,30 +95,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( 
jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -119,29 +135,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 
2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
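+    // For example with neppV=4 (hence neppV2=8): out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }, i.e. v1 fills the lower half and v2 fills the upper half of the merged float vector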
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
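+  // Note: the intrinsics used above assume that the corresponding x86-64 ISA extensions are enabled at compile time (SSE2 for MGONGPU_CPPSIMD == 2, AVX for MGONGPU_CPPSIMD == 4, AVX512F/DQ for MGONGPU_CPPSIMD == 8)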
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index e54290d5a7..e714b3aa97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -98,6 +98,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', + s+'gpu/mgOnGpuVectorsSplitMerge.h', s+'gpu/constexpr_math.h', s+'gpu/cudacpp_config.mk', s+'CMake/src/CMakeLists.txt' ], diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b7cdf09c17..9a6856fdbb 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004445075988769531  +DEBUG: model prefixing takes 0.005223274230957031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -149,21 +150,21 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -179,18 +180,18 @@ INFO: Finding symmetric diagrams for subprocess group epem_mupmum DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.060 s +Wrote files for 8 helas calls in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.170 s +ALOHA: aloha creates 3 routines in 0.188 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.184 s +ALOHA: aloha creates 7 routines in 0.240 s FFV1 FFV1 FFV2 @@ -199,32 +200,32 @@ ALOHA: aloha creates 7 routines in 0.184 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.396s -user 0m1.798s -sys 0m0.425s -Code generation completed in 2 seconds +real 0m2.135s +user 0m1.760s +sys 0m0.316s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -245,9 +246,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -274,9 +276,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc index 44aadd6b60..c91ca7c1ee 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -95,30 +102,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -126,29 +142,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
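+    // For example with neppV=4 (hence neppV2=8): out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }, i.e. v1 fills the lower half and v2 fills the upper half of the merged float vector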
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
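+  // Note: the intrinsics used above assume that the corresponding x86-64 ISA extensions are enabled at compile time (SSE2 for MGONGPU_CPPSIMD == 2, AVX for MGONGPU_CPPSIMD == 4, AVX512F/DQ for MGONGPU_CPPSIMD == 8)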
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 3c991f09cf..ab8e9518b0 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 1.185530662536621) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -47,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004302024841308594  +DEBUG: model prefixing takes 0.005151510238647461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,32 +150,32 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.190 s +ALOHA: aloha creates 4 routines in 0.248 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.190 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m1.709s -user 0m1.562s -sys 0m0.115s -Code generation completed in 2 seconds +real 0m0.661s +user 0m0.598s +sys 0m0.050s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc index 44aadd6b60..c91ca7c1ee 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -95,30 +102,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -126,29 +142,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 156f7ce8e7..cf7deaba7e 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004584789276123047  +DEBUG: model prefixing takes 0.0054302215576171875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -179,46 +180,46 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.009 s -Wrote files for 10 helas calls in 0.078 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.071 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.103 s +ALOHA: aloha creates 2 routines in 0.137 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.088 s +ALOHA: aloha creates 4 routines in 0.124 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.028s -user 0m1.664s -sys 0m0.358s -Code generation completed in 2 seconds +real 0m1.917s +user 0m1.650s +sys 0m0.263s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * @@ -239,9 +240,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -268,9 +270,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 1f90d3c408..b2819b2bc1 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004430294036865234  +DEBUG: model prefixing takes 0.005286693572998047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,45 +151,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.121 s +ALOHA: aloha creates 2 routines in 0.133 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.508s -user 0m0.439s -sys 0m0.064s +real 0m0.515s +user 0m0.455s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
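In scalar form, the reduction reorganised by the hunk below computes deltaME as an upper-triangle sum over the color matrix; a minimal sketch for orientation (colorSumUpperTriangle and the input values are illustrative only, with cf2 assumed to already fold in the factor 2 for off-diagonal terms and the color denominators, as the comments above note):

#include <complex>
#include <cstdio>
#include <vector>

// Upper-triangle color sum: deltaME = sum_i [ Re(jamp_i)*ztempR_i + Im(jamp_i)*ztempI_i ],
// with ztemp_i = sum_{j>=i} cf2[i][j]*jamp_j and cf2 assumed to include the factor 2 (j>i)
// and the 1/colorDenom normalisation.
double colorSumUpperTriangle( const std::vector<std::complex<double>>& jamp,
                              const std::vector<std::vector<double>>& cf2 )
{
  const int ncolor = (int)jamp.size();
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf2[icol][icol] * jamp[icol].real();
    double ztempI = cf2[icol][icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += cf2[icol][jcol] * jamp[jcol].real();
      ztempI += cf2[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  return deltaME;
}

int main()
{
  const std::vector<std::complex<double>> jamp = { { 1.0, 2.0 }, { 0.5, -1.0 } }; // illustrative values
  const std::vector<std::vector<double>> cf2 = { { 3.0, 1.0 }, { 0.0, 2.0 } };    // illustrative values (cf2[1][0] unused)
  std::printf( "deltaME = %f\n", colorSumUpperTriangle( jamp, cf2 ) );
  return 0;
}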
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
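This header (reproduced below for gg_tt.sa) selects one of several fpvmerge/fpvsplit implementations at compile time; the intrinsics variant bit-casts between gcc vector-extension types and __m256d/__m256 values through unions. Where strict ISO C++ semantics are preferred, the same bit cast can be written with memcpy; a minimal sketch (toM256d and vdbl4 are illustrative names, assuming gcc or clang on an AVX-capable host, compiled with -mavx):

#include <immintrin.h>
#include <cstring>

typedef double vdbl4 __attribute__( ( vector_size( 32 ) ) ); // illustrative stand-in for a 4-lane fptype_v

// Bit-cast a gcc vector-extension value to an intrinsic type without union type punning
static inline __m256d toM256d( const vdbl4& v )
{
  __m256d out;
  static_assert( sizeof( out ) == sizeof( v ), "size mismatch" );
  std::memcpy( &out, &v, sizeof( out ) ); // well-defined in ISO C++, compiles to a plain register move
  return out;
}

int main()
{
  const vdbl4 v = { 1., 2., 3., 4. };
  const __m256d d = toM256d( v );
  alignas( 32 ) double buf[4];
  _mm256_store_pd( buf, d );
  return ( buf[0] == 1. && buf[3] == 4. ) ? 0 : 1;
}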
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } 
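For a standalone check of the avx2/512y branch just above, the same convert-and-concatenate pattern can be exercised with plain intrinsics; a minimal sketch (mergeAvx2 and main are illustrative names, assuming an AVX-capable host and compilation with -mavx):

#include <immintrin.h>
#include <cassert>
#include <cstdio>

// Convert two vectors of 4 doubles into one vector of 8 floats (low half = in1, high half = in2)
static inline __m256 mergeAvx2( __m256d in1, __m256d in2 )
{
  const __m128 f1 = _mm256_cvtpd_ps( in1 );       // 4 doubles -> 4 floats
  const __m128 f2 = _mm256_cvtpd_ps( in2 );       // 4 doubles -> 4 floats
  const __m256 lo = _mm256_castps128_ps256( f1 ); // place f1 in the lower 128 bits
  return _mm256_insertf128_ps( lo, f2, 1 );       // insert f2 in the upper 128 bits
}

int main()
{
  alignas( 32 ) const double d1[4] = { 1.5, 2.5, 3.5, 4.5 };
  alignas( 32 ) const double d2[4] = { 5.5, 6.5, 7.5, 8.5 };
  alignas( 32 ) float out[8];
  _mm256_store_ps( out, mergeAvx2( _mm256_load_pd( d1 ), _mm256_load_pd( d2 ) ) );
  for( int i = 0; i < 4; i++ ) assert( out[i] == (float)d1[i] && out[i + 4] == (float)d2[i] );
  std::printf( "avx2 merge ok\n" );
  return 0;
}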
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline 
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0af9646028..c12ba807ab 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.5061478614807129) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -47,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.01866316795349121  +DEBUG: model prefixing takes 0.005397319793701172  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,21 +159,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.020 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -201,22 +201,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.088 s -Wrote files for 46 helas calls in 0.403 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s +Wrote files for 46 helas calls in 0.187 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.419 s +ALOHA: aloha creates 5 routines in 0.297 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.553 s +ALOHA: aloha creates 10 routines in 
0.289 s VVV1 VVV1 FFV1 @@ -226,32 +226,32 @@ ALOHA: aloha creates 10 routines in 0.553 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.986s -user 0m4.846s -sys 0m0.948s -Code generation completed in 6 seconds +real 0m2.606s +user 0m2.289s +sys 0m0.315s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -272,9 +272,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -301,9 +302,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
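The header reproduced below again defines fpvmerge, fpvsplit0 and fpvsplit1; their intended contract is that splitting a merged vector recovers the two inputs up to double-to-float narrowing. A minimal round-trip sketch with gcc vector extensions (vdbl/vflt are illustrative stand-ins for fptype_v/fptype2_v, assuming the 4-lane double case and gcc or clang):

#include <cassert>
#include <cstdio>

typedef double vdbl __attribute__( ( vector_size( 32 ) ) ); // 4 doubles (stand-in for fptype_v)
typedef float vflt __attribute__( ( vector_size( 32 ) ) );  // 8 floats (stand-in for fptype2_v)

int main()
{
  const vdbl a = { 1.5, 2.5, 3.5, 4.5 };
  const vdbl b = { 5.5, 6.5, 7.5, 8.5 };
  // "fpvmerge": narrow and concatenate (a -> low half, b -> high half)
  const vflt m = { (float)a[0], (float)a[1], (float)a[2], (float)a[3],
                   (float)b[0], (float)b[1], (float)b[2], (float)b[3] };
  // "fpvsplit0"/"fpvsplit1": widen each half back to a 4-lane double vector
  const vdbl a2 = { m[0], m[1], m[2], m[3] };
  const vdbl b2 = { m[4], m[5], m[6], m[7] };
  for( int i = 0; i < 4; i++ ) assert( a2[i] == (double)(float)a[i] && b2[i] == (double)(float)b[i] );
  std::printf( "merge/split round trip ok\n" );
  return 0;
}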
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
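The AVX branch of fpvmerge_intrinsics above can be exercised in isolation. The following standalone sketch assumes an x86-64 host with AVX (compile with e.g. g++ -mavx) and uses __m256d/__m256 directly instead of the plugin's fptype_v/fptype2_v typedefs; it illustrates the same cvtpd_ps/castps128/insertf128 sequence and is not plugin code:

#include <immintrin.h>
#include <cstdio>
int main()
{
  __m256d d1 = _mm256_set_pd( 4., 3., 2., 1. );       // _mm256_set_pd takes lanes in reverse order: memory order is 1,2,3,4
  __m256d d2 = _mm256_set_pd( 8., 7., 6., 5. );       // memory order 5,6,7,8
  __m128 f1 = _mm256_cvtpd_ps( d1 );                  // 4 doubles -> 4 floats
  __m128 f2 = _mm256_cvtpd_ps( d2 );                  // 4 doubles -> 4 floats
  __m256 lo = _mm256_castps128_ps256( f1 );           // f1 into the lower 128 bits
  __m256 merged = _mm256_insertf128_ps( lo, f2, 1 );  // f2 into the upper 128 bits
  float out[8];
  _mm256_storeu_ps( out, merged );
  for( int i = 0; i < 8; i++ ) printf( "%g ", out[i] ); // expected: 1 2 3 4 5 6 7 8
  printf( "\n" );
  return 0;
}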
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index e50d05daa6..6f0259b8d8 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004053354263305664  +DEBUG: model prefixing takes 0.005203723907470703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -179,22 +180,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxg DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.030 s -Wrote files for 36 helas calls in 0.096 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s +Wrote files for 36 helas calls in 0.118 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.242 s +ALOHA: aloha creates 5 routines in 0.302 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: 
P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.216 s +ALOHA: aloha creates 10 routines in 0.285 s VVV1 VVV1 FFV1 @@ -204,32 +205,32 @@ ALOHA: aloha creates 10 routines in 0.216 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.399s -user 0m2.037s -sys 0m0.357s -Code generation completed in 3 seconds +real 0m2.435s +user 0m2.125s +sys 0m0.303s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -250,9 +251,10 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,9 +281,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
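The color_sum.cc hunk that follows (and its copies for the other generated processes) changes the accumulation strategy: the per-color contribution is now summed directly into a single fptype2_sv accumulator deltaMEs2 inside the icol loop, and the conversion back to fptype_sv (via fpvsplit0/fpvsplit1 in the mixed SIMD case) happens only once, after the loop. A scalar sketch of that flow, with illustrative color-matrix and amplitude values and without the upper-triangle optimization of the real code:

#include <cstdio>
int main()
{
  const int ncolor = 2;                                       // illustrative, not a real process
  const float cf[2][2] = { { 16.f / 3.f, -2.f / 3.f },        // illustrative color matrix (already in float, like cf2.value)
                           { -2.f / 3.f, 16.f / 3.f } };
  const float jampR[2] = { 0.1f, 0.2f };                      // illustrative jamp real parts (already narrowed to float)
  const float jampI[2] = { 0.3f, 0.4f };                      // illustrative jamp imaginary parts
  float deltaMEs2 = 0.f;                                      // single float accumulator, as in the new code
  for( int icol = 0; icol < ncolor; icol++ )
  {
    float ztempR = 0.f, ztempI = 0.f;
    for( int jcol = 0; jcol < ncolor; jcol++ )                // full row for simplicity (the real code folds the symmetric part)
    {
      ztempR += cf[icol][jcol] * jampR[jcol];
      ztempI += cf[icol][jcol] * jampI[jcol];
    }
    deltaMEs2 += jampR[icol] * ztempR + jampI[icol] * ztempI; // accumulate in float inside the loop
  }
  double deltaMEs = (double)deltaMEs2;                        // convert back to double only once, after the loop
  printf( "deltaMEs = %f\n", deltaMEs );
  return 0;
}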
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
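For comparison with the intrinsics branch just above, the std::experimental::simd path used by fpvmerge_expsimd can also be tried standalone. This sketch assumes GCC's <experimental/simd> implementation of the Parallelism TS v2 (gcc11 or later, -std=c++17) and hardcodes a width of 4 doubles for illustration; it is not plugin code:

#include <experimental/simd>
#include <cstdio>
int main()
{
  namespace stdx = std::experimental;
  const double in1[4] = { 1., 2., 3., 4. };
  const double in2[4] = { 5., 6., 7., 8. };
  stdx::fixed_size_simd<double, 4> sd1( in1, stdx::element_aligned );        // load 4 doubles
  stdx::fixed_size_simd<double, 4> sd2( in2, stdx::element_aligned );
  auto sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<float, 4>>( sd1 ); // narrowing cast double -> float
  auto sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<float, 4>>( sd2 );
  float out[8];
  sf1.copy_to( out, stdx::element_aligned );                                 // low half
  sf2.copy_to( out + 4, stdx::element_aligned );                             // high half
  for( int i = 0; i < 8; i++ ) printf( "%g ", out[i] );                      // expected: 1 2 3 4 5 6 7 8
  printf( "\n" );
  return 0;
}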
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ab60b4e5bd..d6e3e0901b 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0042188167572021484  +DEBUG: model prefixing takes 0.0052950382232666016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.230 s +ALOHA: aloha creates 5 routines in 0.308 s VVV1 VVV1 FFV1 @@ -186,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.230 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.642s -user 0m0.586s -sys 0m0.050s +real 0m0.754s +user 0m0.689s +sys 0m0.060s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
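One detail shared by all copies of fpvmerge_intrinsics is the union-based bitcast between the GCC vector-extension type (fptype_v) and the corresponding intrinsic type (__m128d, __m256d or __m512d): the bits are reinterpreted, no value conversion happens. A standalone sketch of that idiom, assuming x86-64 with AVX (g++ -mavx); note that union type punning is a GCC/Clang-supported extension rather than strictly portable C++, and v4d here is an illustrative stand-in for fptype_v:

#include <immintrin.h>
#include <cstdio>
typedef double v4d __attribute__( ( vector_size( 32 ) ) ); // illustrative stand-in for fptype_v (4 doubles)
int main()
{
  v4d v = { 1.5, 2.5, 3.5, 4.5 };
  union { v4d v; __m256d m; } u;     // same idiom as fpvmerge_intrinsics: reinterpret the bits as an intrinsic type
  u.v = v;
  __m128 f = _mm256_cvtpd_ps( u.m ); // now the value can be fed to the intrinsics API
  float out[4];
  _mm_storeu_ps( out, f );
  printf( "%g %g %g %g\n", out[0], out[1], out[2], out[3] ); // expected: 1.5 2.5 3.5 4.5
  return 0;
}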
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
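For reference, the double-to-float packing performed by fpvmerge_intrinsics above can be reproduced in a standalone program. The sketch below shows only the AVX2 path (_mm256_cvtpd_ps followed by _mm256_insertf128_ps), with hypothetical inputs and local variable names that are not part of the CUDACPP type system; the sse4 and 512z branches follow the same pattern with _mm_movelh_ps and _mm512_insertf32x8 respectively.

#include <immintrin.h>
#include <cstdio>

int main()
{
  // Hypothetical inputs: two vectors of 4 doubles (standing in for two fptype_v)
  __m256d d1 = _mm256_set_pd( 4., 3., 2., 1. ); // elements 1,2,3,4 in memory order
  __m256d d2 = _mm256_set_pd( 8., 7., 6., 5. ); // elements 5,6,7,8 in memory order
  __m128 f1 = _mm256_cvtpd_ps( d1 );            // narrow 4 doubles to 4 floats
  __m128 f2 = _mm256_cvtpd_ps( d2 );            // narrow 4 doubles to 4 floats
  __m256 lo = _mm256_castps128_ps256( f1 );     // f1 occupies the lower 128 bits
  __m256 merged = _mm256_insertf128_ps( lo, f2, 1 ); // f2 goes into the upper 128 bits
  float out[8];
  _mm256_storeu_ps( out, merged );
  for( int i = 0; i < 8; i++ ) printf( "%f\n", out[i] ); // expect 1..8
  return 0;
}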
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c941153c6..e0b6ab8c49 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004433155059814453  +DEBUG: model prefixing takes 0.005160331726074219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.125 s +1 processes with 123 diagrams generated in 0.153 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -179,22 +180,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxgg DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 
30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.307 s -Wrote files for 222 helas calls in 0.475 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.406 s +Wrote files for 222 helas calls in 0.628 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.280 s +ALOHA: aloha creates 5 routines in 0.310 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.246 s +ALOHA: aloha creates 10 routines in 0.290 s VVV1 VVV1 FFV1 @@ -207,32 +208,32 @@ ALOHA: aloha creates 10 routines in 0.246 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.426s -user 0m3.041s -sys 0m0.376s -Code generation completed in 4 seconds +real 0m3.744s +user 0m3.440s +sys 0m0.297s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -253,9 +254,10 @@ Code generation completed in 4 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,9 +284,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc index 91a7f9998e..9ade78ca77 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -119,30 +126,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
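The comment above refers to folding the symmetric color matrix so that only its upper triangle is visited: the diagonal entry is counted once and each off-diagonal pair is pre-multiplied by 2 (the constexpr "2*" and "/colorDenom[icol]" factors). A small scalar illustration of why the folded sum equals the full double sum, using a hypothetical 3x3 symmetric matrix rather than the real cf2 table:

#include <cstdio>

int main()
{
  // Hypothetical symmetric 3x3 matrix and jamp values (illustration only)
  const double cf[3][3] = { { 3, 1, 2 }, { 1, 4, 5 }, { 2, 5, 6 } };
  const double jamp[3] = { 0.7, -1.2, 0.4 };
  // Full double loop: sum_ij jamp_i * cf_ij * jamp_j
  double full = 0;
  for( int i = 0; i < 3; i++ )
    for( int j = 0; j < 3; j++ )
      full += jamp[i] * cf[i][j] * jamp[j];
  // Upper-triangle loop as in color_sum.cc: diagonal once, off-diagonal terms folded with a factor 2
  double tri = 0;
  for( int i = 0; i < 3; i++ )
  {
    double ztemp = cf[i][i] * jamp[i];
    for( int j = i + 1; j < 3; j++ ) ztemp += 2 * cf[i][j] * jamp[j];
    tri += jamp[i] * ztemp;
  }
  printf( "full=%f tri=%f\n", full, tri ); // identical for a symmetric matrix
  return 0;
}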
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -150,29 +166,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
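Just below, fpvmerge_expsimd narrows doubles to floats through std::experimental::simd instead of raw intrinsics; static_simd_cast is used because simd_cast refuses narrowing conversions. A minimal standalone sketch of that narrowing, assuming a compiler that ships the Parallelism TS 2 header <experimental/simd> (e.g. recent gcc); all names here are illustrative:

#include <experimental/simd>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  constexpr int n = 4;
  double in[n] = { 1.5, 2.5, 3.5, 4.5 };
  stdx::fixed_size_simd<double, n> sd( in, stdx::element_aligned );
  // static_simd_cast performs the double-to-float narrowing element-wise
  auto sf = stdx::static_simd_cast<stdx::fixed_size_simd<float, n>>( sd );
  float out[n];
  sf.copy_to( out, stdx::element_aligned );
  for( int i = 0; i < n; i++ ) printf( "%f\n", out[i] ); // expect 1.5 2.5 3.5 4.5
  return 0;
}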
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 691a9d08c7..73ad830e21 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004384040832519531  +DEBUG: model prefixing takes 0.005399942398071289  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.118 s +1 processes with 123 diagrams generated in 0.159 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.366 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.231 s +ALOHA: aloha creates 5 routines in 0.298 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.208s -user 0m1.150s -sys 0m0.049s +real 0m1.404s +user 0m1.335s +sys 0m0.062s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc index 91a7f9998e..9ade78ca77 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -119,30 +126,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
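In the color_sum.cc hunk that continues below, the per-color contributions are now accumulated directly into the fptype2 vector deltaMEs2, and the split back into the two fptype vectors (fpvsplit0/fpvsplit1) happens only once at store time, instead of once per icol iteration as in the removed lines. A scalar schematic of the two accumulation patterns, with hypothetical values and float/double standing in for fptype2/fptype:

#include <cstdio>

int main()
{
  const int ncolor = 4;
  const float terms[ncolor] = { 0.25f, 0.5f, 0.125f, 1.0f }; // hypothetical per-color contributions
  // Old pattern: convert (split) back to double on every iteration
  double sumOld = 0;
  for( int icol = 0; icol < ncolor; icol++ ) sumOld += (double)terms[icol];
  // New pattern: accumulate in float, convert (split) once at store time
  float acc = 0;
  for( int icol = 0; icol < ncolor; icol++ ) acc += terms[icol];
  double sumNew = (double)acc;
  printf( "old=%f new=%f\n", sumOld, sumNew ); // same value here; the new pattern does one conversion instead of ncolor
  return 0;
}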
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -150,29 +166,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
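+
+// This header collects the fpvmerge/fpvsplit0/fpvsplit1 helpers previously defined in mgOnGpuVectors.h,
+// together with several alternative implementations selectable via the MGONGPU_FPVFUN_* macros below.
+// They are only needed in the mixed-precision SIMD build (double fptype, float fptype2, MGONGPU_CPPSIMD):
+// color_sum.cc merges the real and imaginary parts of two neppV double vectors into one neppV2 float
+// vector, accumulates the color sum in float, and finally splits the float accumulator back into two
+// double vectors. A minimal sketch of that flow (variable names below are illustrative only):
+//   fptype2_sv jampR = fpvmerge( reA, reB ); // two neppV double vectors -> one neppV2 float vector
+//   fptype2_sv delta2 = jampR * jampR;       // (simplified) color sum accumulated in float precision
+//   fptype_sv d0 = fpvsplit0( delta2 );      // back to doubles for the first neppV events
+//   fptype_sv d1 = fpvsplit1( delta2 );      // back to doubles for the next neppV events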
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
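+
+  //--------------------------------------------------------------------------
+
+  // A minimal consistency-check sketch (hypothetical helper, not used elsewhere in the plugin):
+  // when MGONGPU_FPVFUN_INTRINSICS is enabled, the intrinsics path above should produce exactly
+  // the same neppV2 floats as the default initializer-list path, since both apply the same
+  // round-to-nearest double-to-float conversion (assuming the default MXCSR rounding mode).
+  inline bool
+  fpvmerge_intrinsics_agrees( const fptype_v& v1, const fptype_v& v2 )
+  {
+    const fptype2_v outI = fpvmerge_intrinsics( v1, v2 );      // merge via _mm*_cvtpd_ps intrinsics
+    const fptype2_v outL = fpvmerge_initializerlist( v1, v2 ); // merge via (fptype2) casts
+    for( int ieppV = 0; ieppV < 2 * neppV; ieppV++ )
+      if( outI[ieppV] != outL[ieppV] ) return false;
+    return true;
+  }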
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 5908592d13..7fd19e2034 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0061588287353515625  +DEBUG: model prefixing takes 0.005022764205932617  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,27 +151,27 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.427 s +1 processes with 1240 diagrams generated in 1.815 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -181,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 
410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 
709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 
123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 
555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 
834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s -Wrote files for 2281 helas calls in 17.935 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.274 s +Wrote files for 2281 helas calls in 17.122 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.379 s +ALOHA: aloha creates 5 routines in 0.294 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.232 s +ALOHA: aloha creates 10 routines in 0.289 s VVV1 VVV1 FFV1 @@ -209,31 +210,31 @@ ALOHA: aloha creates 10 routines in 0.232 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m31.040s -user 0m30.219s -sys 0m0.591s +real 0m31.007s +user 0m30.369s +sys 0m0.485s Code generation completed in 31 seconds ************************************************************ * * @@ -255,9 +256,10 @@ Code generation completed in 31 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -284,9 +286,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index dea7f9fdb2..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -215,30 +222,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -246,29 +262,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
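+
+// Same split/merge helper header as in the other process directories touched by this patch (e.g.
+// gg_ttgg.sa/src): exactly one of the MGONGPU_FPVFUN_* implementations selected below is compiled in,
+// with the initializer-list version as the default and the scalar, intrinsics and experimental-simd
+// versions kept only for tests and performance comparisons.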
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into the lower 64 bits of a __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into the lower 64 bits of a __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into the lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into its higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into the lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into its higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
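For reference, every fpvmerge implementation in this header must satisfy the same contract: the low neppV entries of the output are v1 narrowed to float and the high neppV entries are v2 narrowed to float. A minimal standalone sketch of that contract (not part of the generated header; it assumes gcc/clang vector extensions and a hypothetical width of 4 doubles per vector):

#include <cassert>
typedef double dvec __attribute__( ( vector_size( 32 ) ) ); // 4 doubles (hypothetical neppV = 4)
typedef float fvec __attribute__( ( vector_size( 32 ) ) );  // 8 floats (neppV2 = 8)

// Reference semantics: low half from v1, high half from v2, each narrowed to float
fvec merge_ref( const dvec& v1, const dvec& v2 )
{
  fvec out;
  for( int i = 0; i < 4; i++ )
  {
    out[i] = v1[i];
    out[i + 4] = v2[i];
  }
  return out;
}

int main()
{
  const dvec v1 = { 1.5, 2.5, 3.5, 4.5 };
  const dvec v2 = { -1.5, -2.5, -3.5, -4.5 };
  const fvec out = merge_ref( v1, v2 );
  for( int i = 0; i < 4; i++ )
  {
    assert( out[i] == (float)v1[i] );
    assert( out[i + 4] == (float)v2[i] );
  }
  return 0;
}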
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 4f7b5172f1..f200ff33c1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004235267639160156  +DEBUG: model prefixing takes 0.0051381587982177734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.490 s +1 processes with 1240 diagrams generated in 1.791 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 5.122 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.258 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.290 s +ALOHA: aloha creates 5 routines in 0.321 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.290 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m10.012s -user 0m9.867s -sys 0m0.109s -Code generation completed in 10 seconds +real 0m12.319s +user 0m12.172s +sys 0m0.101s +Code generation completed in 12 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc index dea7f9fdb2..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -215,30 +222,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
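For reference, the upper-triangle loop in the hunk below relies on the symmetry of the color matrix: once the factor 2 and the 1/colorDenom[icol] normalization are folded into the precomputed cf2.value entries, summing the diagonal plus the doubled upper-diagonal terms reproduces the full double sum over colors. A small scalar sketch of that identity, using a made-up 3x3 symmetric matrix and arbitrary amplitudes (not the generated code):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const int ncolor = 3;
  const double cf[3][3] = { { 12, 4, 4 }, { 4, 12, 4 }, { 4, 4, 12 } }; // symmetric toy color matrix
  const std::complex<double> jamp[3] = { { 1.1, -0.2 }, { -0.7, 0.4 }, { 0.3, 0.9 } };
  // Full double sum: ME = sum_ij Re[ conj(jamp_i) * cf_ij * jamp_j ]
  double me1 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me1 += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real();
  // Upper-triangle sum as in color_sum.cc: diagonal once, off-diagonal terms counted twice
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] * jamp[i].real();
    double ztempI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] * jamp[j].real();
      ztempI += 2 * cf[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 );
  return 0;
}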
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -246,29 +262,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into the lower 64 bits of a __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into the lower 64 bits of a __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into the lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into its higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into the lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into its higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
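As a standalone illustration of the AVX branch above (not generated code; it assumes an AVX-capable host and compilation with -mavx), the cvtpd_ps / castps128 / insertf128 sequence can be checked in isolation:

#include <cassert>
#include <immintrin.h>

int main()
{
  const __m256d d1 = _mm256_set_pd( 4., 3., 2., 1. );   // elements {1,2,3,4} (set_pd takes arguments high to low)
  const __m256d d2 = _mm256_set_pd( 8., 7., 6., 5. );   // elements {5,6,7,8}
  const __m128 f1 = _mm256_cvtpd_ps( d1 );              // narrow 4 doubles to 4 floats
  const __m128 f2 = _mm256_cvtpd_ps( d2 );              // narrow 4 doubles to 4 floats
  const __m256 lo = _mm256_castps128_ps256( f1 );       // f1 goes into the lower 128 bits
  const __m256 out = _mm256_insertf128_ps( lo, f2, 1 ); // f2 goes into the upper 128 bits
  alignas( 32 ) float buf[8];
  _mm256_store_ps( buf, out );
  for( int i = 0; i < 8; i++ ) assert( buf[i] == (float)( i + 1 ) );
  return 0;
}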
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 71b7095c67..c7d0c93632 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004422187805175781  +DEBUG: model prefixing takes 0.005384206771850586  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,21 +166,21 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.058 s +8 processes with 40 diagrams generated in 0.075 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -213,46 +214,46 @@ INFO: Finding symmetric diagrams for subprocess group gux_ttxux DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s -Wrote files for 32 helas calls in 0.131 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +Wrote files for 32 helas calls in 0.159 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.106 s +ALOHA: aloha creates 2 routines in 0.133 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.094 s +ALOHA: aloha creates 4 routines in 0.123 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.314s -user 0m1.828s -sys 0m0.404s +real 0m2.202s +user 0m1.892s +sys 0m0.310s Code generation completed in 2 seconds ************************************************************ * * @@ -274,9 +275,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,9 +305,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
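The change in the hunk below replaces the per-icol split of deltaMEs2 by a single accumulation in fptype2_sv followed by one fpvsplit0/fpvsplit1 call after the loop; since the split functions only select elements, the two orderings are algebraically equivalent, although accumulating in single precision can differ from the old per-icol double-precision accumulation at the level of float rounding. A small sketch of the equivalence (hypothetical widths of 2 doubles and 4 floats, gcc/clang vector extensions, not the generated code):

#include <cassert>
typedef double dvec __attribute__( ( vector_size( 16 ) ) ); // 2 doubles
typedef float fvec __attribute__( ( vector_size( 16 ) ) );  // 4 floats

dvec split0( const fvec& v ) { dvec out = { v[0], v[1] }; return out; } // low half
dvec split1( const fvec& v ) { dvec out = { v[2], v[3] }; return out; } // high half

int main()
{
  const fvec contrib[3] = { { 1.f, 2.f, 3.f, 4.f }, { 5.f, 6.f, 7.f, 8.f }, { 9.f, 10.f, 11.f, 12.f } }; // per-icol contributions
  // Old pattern: split each contribution and accumulate two double vectors
  dvec accA0 = { 0., 0. }, accA1 = { 0., 0. };
  for( int i = 0; i < 3; i++ )
  {
    accA0 += split0( contrib[i] );
    accA1 += split1( contrib[i] );
  }
  // New pattern: accumulate one float vector and split once at the end
  fvec accF = { 0.f, 0.f, 0.f, 0.f };
  for( int i = 0; i < 3; i++ ) accF += contrib[i];
  const dvec accB0 = split0( accF );
  const dvec accB1 = split1( accF );
  for( int i = 0; i < 2; i++ )
  {
    assert( accA0[i] == accB0[i] );
    assert( accA1[i] == accB1[i] );
  }
  return 0;
}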
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
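+ // (The loop below copies v1 into the low neppV elements of out and v2 into the high neppV elements, + // narrowing each double to a float on assignment: this is the layout that fpvsplit0 and fpvsplit1 later undo)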
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index d16040de18..15d45a8d6f 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004274129867553711  +DEBUG: model prefixing takes 0.005454063415527344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.059 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -183,45 +184,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=1 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=1 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.053 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.105 s +ALOHA: aloha creates 2 routines in 0.136 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.535s -user 0m0.481s -sys 0m0.048s +real 0m0.769s +user 0m0.593s +sys 0m0.044s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
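+ // (The loop below copies v1 into the low neppV elements of out and v2 into the high neppV elements, + // narrowing each double to a float on assignment: this is the layout that fpvsplit0 and fpvsplit1 later undo)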
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index faef5b2d67..324a98d14f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -46,16 +46,17 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -129,14 +130,14 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -152,50 +153,50 @@ INFO: Finding symmetric diagrams for subprocess group gg_bbx DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s -Wrote files for 12 helas calls in 0.062 s +Wrote files for 12 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.193 s +ALOHA: aloha creates 4 routines in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.178 s +ALOHA: aloha creates 8 routines in 0.232 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.118s -user 0m1.750s -sys 0m0.364s +real 0m2.150s +user 0m1.847s +sys 0m0.287s Code generation completed in 2 seconds ************************************************************ * * @@ -217,9 +218,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -246,9 +248,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc index 94b1137d64..2e30c78630 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -98,30 +105,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -129,29 +145,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
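As an aside to the intrinsics variant above: the union bitcasts only move data between the gcc vector types and the intrinsic types, while the real work is one cvtpd_ps narrowing per input followed by a 128/256-bit insert. A minimal self-contained sketch of the same AVX chain on plain arrays (not part of the patch; it assumes an x86-64 host with AVX and corresponds to the MGONGPU_CPPSIMD == 4 case):

// Sketch only: merge 4+4 doubles into 8 floats with AVX intrinsics (e.g. "g++ -mavx -O2" on x86-64).
#include <immintrin.h>
#include <cstdio>
int main()
{
  const double d1[4] = { 1.5, 2.5, 3.5, 4.5 };
  const double d2[4] = { 5.5, 6.5, 7.5, 8.5 };
  __m256d v1 = _mm256_loadu_pd( d1 );                // load 4 doubles
  __m256d v2 = _mm256_loadu_pd( d2 );                // load 4 doubles
  __m128 f1 = _mm256_cvtpd_ps( v1 );                 // narrow to 4 floats
  __m128 f2 = _mm256_cvtpd_ps( v2 );                 // narrow to 4 floats
  __m256 lo = _mm256_castps128_ps256( f1 );          // f1 into the lower 128 bits
  __m256 merged = _mm256_insertf128_ps( lo, f2, 1 ); // f2 into the upper 128 bits
  float out[8];
  _mm256_storeu_ps( out, merged );                   // 8 floats: narrowed d1 then narrowed d2
  for( int i = 0; i < 8; i++ ) std::printf( "%.1f\n", out[i] );
  return 0;
}

The expected output is 1.5 through 8.5, i.e. the narrowed d1 followed by the narrowed d2, which is the layout fpvmerge_intrinsics produces for neppV = 4.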
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 5208ed190c..56ed839e3c 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -46,65 +46,18 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:47:55-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz -Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.1.243 -Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. -HTTP request sent, awaiting response... 200 OK -Length: 50876 (50K) [application/x-gzip] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... ......... 
100% 921K=0.05s - -2025-10-22 11:47:55 (921 KB/s) - ‘tmp.tgz’ saved [50876/50876] - -heft/ -heft/write_param_card.py -heft/restrict_ckm.dat -heft/couplings.py -heft/HEFT_UFO.log -heft/lorentz.py -heft/__init__.py -heft/__pycache__/ -heft/particles.py -heft/object_library.py -heft/restrict_default.dat -heft/restrict_zeromass_ckm.dat -heft/restrict_no_b_mass.dat -heft/function_library.py -heft/parameters.py -heft/py3_model.pkl -heft/coupling_orders.py -heft/restrict_no_tau_mass.dat -heft/vertices.py -heft/restrict_no_masses.dat -heft/__pycache__/write_param_card.cpython-311.pyc -heft/__pycache__/parameters.cpython-311.pyc -heft/__pycache__/function_library.cpython-311.pyc -heft/__pycache__/coupling_orders.cpython-311.pyc -heft/__pycache__/object_library.cpython-311.pyc -heft/__pycache__/couplings.cpython-311.pyc -heft/__pycache__/particles.cpython-311.pyc -heft/__pycache__/vertices.cpython-311.pyc -heft/__pycache__/lorentz.cpython-311.pyc -heft/__pycache__/__init__.cpython-311.pyc -INFO: reload from .py file -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.004904985427856445  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -170,49 +123,49 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.004 s +1 processes with 4 diagrams generated in 0.006 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 
'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. -Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.185 s +ALOHA: aloha creates 4 routines in 0.249 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.821s -user 0m0.568s -sys 0m0.084s +real 0m0.631s +user 0m0.567s +sys 0m0.050s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc index 94b1137d64..2e30c78630 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -98,30 +105,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) 
in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -129,29 +145,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * 
ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
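The header added below is an identical copy of the heft_gg_bb.mad one above. Whichever MGONGPU_FPVFUN_* variant is selected, fpvmerge/fpvsplit0/fpvsplit1 should satisfy an exact round trip for values representable in both precisions. A minimal self-contained check of that property (not part of the patch; plain gcc vector extensions stand in for fptype_v/fptype2_v in the MGONGPU_CPPSIMD == 4 case, and merge/split0/split1 are illustrative helper names only):

// Sketch only: round-trip check for a 4-double / 8-float merge and split (g++ or clang++ on x86-64).
#include <cassert>
typedef double v4d __attribute__( ( vector_size( 32 ) ) ); // stand-in for fptype_v (4 doubles)
typedef float v8f __attribute__( ( vector_size( 32 ) ) );  // stand-in for fptype2_v (8 floats)
static v8f merge( const v4d& a, const v4d& b )
{
  return v8f{ (float)a[0], (float)a[1], (float)a[2], (float)a[3], (float)b[0], (float)b[1], (float)b[2], (float)b[3] };
}
static v4d split0( const v8f& v ) { return v4d{ v[0], v[1], v[2], v[3] }; }
static v4d split1( const v8f& v ) { return v4d{ v[4], v[5], v[6], v[7] }; }
int main()
{
  const v4d a = { 1, 2, 3, 4 }, b = { 5, 6, 7, 8 }; // values exactly representable as floats
  const v8f m = merge( a, b );
  // Splitting the merged vector must give back the (narrowed) inputs
  for( int i = 0; i < 4; i++ ) assert( split0( m )[i] == a[i] && split1( m )[i] == b[i] );
  return 0;
}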
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
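For this identical heft_gg_bb.sa copy, a complementary aside on the EXPSIMD variant: the double-to-float narrowing it relies on can also be exercised standalone. A minimal sketch with std::experimental::simd (not part of the patch; it assumes libstdc++ from gcc11 or later, which provides <experimental/simd> as a Parallelism TS v2 implementation):

// Sketch only: narrow 4 doubles to 4 floats with std::experimental::simd (g++-11 or later).
#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;
int main()
{
  const double d[4] = { 1.5, 2.5, 3.5, 4.5 };
  stdx::fixed_size_simd<double, 4> sd( d, stdx::element_aligned );         // load 4 doubles
  auto sf = stdx::static_simd_cast<stdx::fixed_size_simd<float, 4>>( sd ); // narrowing cast
  float f[4];
  sf.copy_to( f, stdx::element_aligned );                                  // store 4 floats
  for( int i = 0; i < 4; i++ ) std::printf( "%.1f\n", f[i] );
  return 0;
}

static_simd_cast is used because simd_cast rejects narrowing conversions, which is the same reason given in the comment inside fpvmerge_expsimd.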
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index b5ca9e6bb6..2a8b270382 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004863262176513672  +DEBUG: model prefixing takes 0.005410671234130859  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.093 s +4 processes with 8 diagrams generated in 0.106 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. 
@@ -222,21 +223,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.520 s +12 processes with 144 diagrams generated in 0.618 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -350,18 +351,18 @@ INFO: Finding symmetric diagrams for subprocess group dux_ttxwm DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.172 s -Wrote files for 212 helas calls in 0.856 s +Generated helas calls for 8 subprocesses (76 diagrams) in 0.191 s +Wrote files for 212 helas calls in 0.803 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.166 s +ALOHA: aloha creates 3 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.150 s +ALOHA: aloha creates 6 routines in 0.189 s FFV1 FFV1 FFV1 @@ -369,31 
+370,31 @@ ALOHA: aloha creates 6 routines in 0.150 s FFV2 FFV2 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.809s -user 0m4.082s -sys 0m0.695s +real 0m4.603s +user 0m4.081s +sys 0m0.510s Code generation completed in 5 seconds ************************************************************ * * @@ -415,9 +416,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -444,9 +446,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc index 04c22fd369..7bf4387b35 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. 
Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& 
jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc index 04c22fd369..7bf4387b35 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + }
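Editor's note (illustrative only, not part of the patch): the intrinsics path above and the initializer-list path are expected to produce identical merged vectors. A standalone sketch for the MGONGPU_CPPSIMD == 2 ("sse4") case, assuming an x86-64 host with SSE2, checking the _mm_cvtpd_ps/_mm_movelh_ps merge against a scalar double-to-float reference; all names are local to the sketch:

#include <cassert>
#include <cstdio>
#include <immintrin.h>

int main()
{
  const double d1[2] = { 1.25, -2.5 };
  const double d2[2] = { 3.75, -4.5 };
  __m128 lo = _mm_cvtpd_ps( _mm_loadu_pd( d1 ) ); // 2 doubles -> 2 floats in the low 64 bits
  __m128 hi = _mm_cvtpd_ps( _mm_loadu_pd( d2 ) ); // 2 doubles -> 2 floats in the low 64 bits
  __m128 merged = _mm_movelh_ps( lo, hi );        // [ d1[0], d1[1], d2[0], d2[1] ] as floats
  float out[4];
  _mm_storeu_ps( out, merged );
  for( int i = 0; i < 2; i++ )
  {
    assert( out[i] == (float)d1[i] );
    assert( out[i + 2] == (float)d2[i] );
  }
  printf( "sse merge ok\n" );
  return 0;
}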
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0da34a0aa2..8a76eb7123 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0046498775482177734  +DEBUG: model prefixing takes 0.005301475524902344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -167,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.025 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
@@ -207,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.114 s +13 processes with 76 diagrams generated in 0.138 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -373,21 +374,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.872 s +65 processes with 1119 diagrams generated in 1.745 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -688,22 +689,22 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttx DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.392 s -Wrote files for 810 helas calls in 2.303 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.223 s +Wrote files for 810 helas calls in 2.665 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 
routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.325 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.237 s +ALOHA: aloha creates 10 routines in 0.297 s VVV1 VVV1 FFV1 @@ -716,32 +717,32 @@ ALOHA: aloha creates 10 routines in 0.237 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. 
quit -real 0m10.952s -user 0m9.707s -sys 0m1.156s -Code generation completed in 11 seconds +real 0m10.200s +user 0m9.319s +sys 0m0.845s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * @@ -762,9 +763,10 @@ Code generation completed in 11 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -791,9 +793,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
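Editor's note (illustrative only, not part of the patch): the comment above refers to the color sum |M|^2 = sum_ij jamp_i^* cf_ij jamp_j for a real symmetric color matrix, computed over the upper triangle only because the factor of 2 (and the color denominator) is folded into cf2.value at compile time. A scalar double-precision sketch with made-up numbers, checking the upper-triangle loop against the full double loop; all names are local to the sketch:

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 3;
  // Made-up real symmetric "color matrix" and made-up color amplitudes
  const double cf[ncolor][ncolor] = { { 3., 1., -1. }, { 1., 4., 2. }, { -1., 2., 5. } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -0.5, 0.25 }, { 0.75, -1.5 } };
  // Reference: full double loop over the complex quadratic form
  double ref = 0.;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      ref += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real();
  // Upper-triangle loop with pre-doubled off-diagonal factors (the role played by cf2.value)
  double deltaME = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf[icol][icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2. * cf[icol][jcol] * jamp[jcol].real();
      ztempI += 2. * cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  assert( std::abs( deltaME - ref ) < 1e-12 * std::abs( ref ) );
  printf( "deltaME = %f (matches full sum)\n", deltaME );
  return 0;
}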
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc index 04c22fd369..7bf4387b35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc index 91a7f9998e..9ade78ca77 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -119,30 +126,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -150,29 +166,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc index 767405ac3b..c2a09ea450 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
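For reference, the hunk just below (and the identical hunks for the other P2 subprocesses further down) accumulates, per event, deltaME = sum_i [ cf2[i][i]*|jamp_i|^2 + sum_{j>i} cf2[i][j]*Re(jamp_i^* * jamp_j) ], visiting only the upper triangle because, as the comment above notes, the off-diagonal factor 2 and the 1/colorDenom[icol] normalisation are folded into cf2 at compile time. The following is a minimal scalar sketch of that sum (hypothetical single-event code without SIMD, written by the editor as an illustration, not the plugin's actual implementation):

// Editor's sketch: scalar, single-event version of the upper-triangular color sum.
// Assumption (from the comment above): cf2 already folds in the off-diagonal factor 2
// and the 1/colorDenom[icol] normalisation, so only jcol >= icol needs to be visited.
#include <cstdio>

double colorSumScalar( int ncolor, const double* cf2, const double* jampR, const double* jampI )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // Diagonal term
    double ztempR = cf2[icol * ncolor + icol] * jampR[icol];
    double ztempI = cf2[icol * ncolor + icol] * jampI[icol];
    // Off-diagonal terms (upper triangle only)
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += cf2[icol * ncolor + jcol] * jampR[jcol];
      ztempI += cf2[icol * ncolor + jcol] * jampI[jcol];
    }
    // Add this row's contribution: cf2[i][i]*|jamp_i|^2 + sum_{j>i} cf2[i][j]*Re(jamp_i^* * jamp_j)
    deltaME += jampR[icol] * ztempR + jampI[icol] * ztempI;
  }
  return deltaME;
}

int main()
{
  // Toy 2-color example with made-up numbers (purely illustrative)
  const double cf2[4] = { 3., 1., 1., 3. };
  const double jampR[2] = { 0.5, -0.2 };
  const double jampI[2] = { 0.1, 0.3 };
  printf( "deltaME = %f\n", colorSumScalar( 2, cf2, jampR, jampI ) );
  return 0;
}

The SIMD hunks differ from this sketch only in that each scalar becomes a vector over neppV (or neppV2) events; the point of the patch is that the accumulation now stays in the fptype2_sv vector, so fpvsplit0/fpvsplit1 are called once after the icol loop rather than once per icol.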
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc index db09ae848e..2a30d37c98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc index 13c347c712..542cb89303 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc index 82ceb3958f..c0daf8c97c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
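+  // Worked example of the fpvmerge contract shared by all implementations in this file (see fpvmerge_scalar above):
+  // with MGONGPU_CPPSIMD == 4, merging v1 = { 1., 2., 3., 4. } and v2 = { 5., 6., 7., 8. } is expected to return
+  // the float vector { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }, i.e. v1 narrowed to float in the low half and
+  // v2 narrowed to float in the high half; fpvsplit0 and fpvsplit1 below recover the two halves again.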
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index e728335e4c..b380cac4ff 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -46,16 +46,17 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -72,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07860422134399414  +DEBUG: model prefixing takes 0.12215161323547363  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -87,21 +88,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.729 s +1 processes with 72 diagrams generated in 3.569 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py 
at line 176]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -116,22 +117,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxttx DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.132 s -Wrote files for 119 helas calls in 0.360 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.177 s +Wrote files for 119 helas calls in 0.369 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.215 s +ALOHA: aloha creates 5 routines in 0.300 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates 
VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.214 s +ALOHA: aloha creates 10 routines in 0.292 s VVV5 VVV5 FFV1 @@ -141,32 +142,32 @@ ALOHA: aloha creates 10 routines in 0.214 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. 
quit -real 0m5.833s -user 0m5.426s -sys 0m0.391s -Code generation completed in 6 seconds +real 0m6.975s +user 0m6.611s +sys 0m0.326s +Code generation completed in 7 seconds ************************************************************ * * * W E L C O M E to * @@ -187,9 +188,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,9 +218,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc index 767405ac3b..c2a09ea450 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py index 33a89259f8..57a85b0614 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,10 +116,9 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters - param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) + exec("%s = %s" % (parameter.name, parameter.value)) except Exception: pass text = "## Not dependent paramater.\n" @@ -135,7 +134,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value, globals(), param_values)).real + value = complex(eval(param.value)).real else: value = param.value diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
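+  // Worked example of the fpvmerge contract shared by all implementations in this file (see fpvmerge_scalar above):
+  // with MGONGPU_CPPSIMD == 4, merging v1 = { 1., 2., 3., 4. } and v2 = { 5., 6., 7., 8. } is expected to return
+  // the float vector { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }, i.e. v1 narrowed to float in the low half and
+  // v2 narrowed to float in the high half; fpvsplit0 and fpvsplit1 below recover the two halves again.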
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 065f7b4329..fe50b4ec4d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -46,51 +46,17 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:49:03-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz -Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 -Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. -HTTP request sent, awaiting response... 200 Ok -Length: 80562 (79K) [application/x-tar] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... .......... 63% 830K 0s - 50K .......... .......... ........ 
100% 124M=0.06s - -2025-10-22 11:49:03 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] - -SMEFTsim_topU3l_MwScheme_UFO/ -SMEFTsim_topU3l_MwScheme_UFO/__init__.py -SMEFTsim_topU3l_MwScheme_UFO/param_card_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/CT_couplings.py -SMEFTsim_topU3l_MwScheme_UFO/particles.py -SMEFTsim_topU3l_MwScheme_UFO/write_param_card.py -SMEFTsim_topU3l_MwScheme_UFO/decays.py -SMEFTsim_topU3l_MwScheme_UFO/parameters.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/object_library.py -SMEFTsim_topU3l_MwScheme_UFO/coupling_orders.py -SMEFTsim_topU3l_MwScheme_UFO/version.info -SMEFTsim_topU3l_MwScheme_UFO/function_library.py -SMEFTsim_topU3l_MwScheme_UFO/couplings.py -SMEFTsim_topU3l_MwScheme_UFO/propagators.py -SMEFTsim_topU3l_MwScheme_UFO/lorentz.py -SMEFTsim_topU3l_MwScheme_UFO/vertices.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat -fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO -retry the load of the model +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -107,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07803130149841309  +DEBUG: model prefixing takes 0.12230682373046875  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -116,42 +82,39 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ -INFO: Change particles name to pass to MG5 convention -Kept definitions of multiparticles p / j / l+ / l- / vl / vl~ unchanged -Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ generate g g > t t~ t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.695 s +1 processes with 72 diagrams generated in 3.548 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 
'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.127 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
+Generated helas calls for 1 subprocesses (72 diagrams) in 0.176 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.291 s VVV5 VVV5 FFV1 @@ -161,17 +124,17 @@ ALOHA: aloha creates 5 routines in 0.281 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.417s -user 0m3.862s -sys 0m0.114s +real 0m4.857s +user 0m4.752s +sys 0m0.071s Code generation completed in 5 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc index 767405ac3b..c2a09ea450 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
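+ // Copy the neppV fptype (double) elements of v1 into the low half of out and those of v2 into the high half; each element assignment implicitly narrows fptype (double) to fptype2 (float)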
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } 
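+ // Note: the unions above bitcast between the compiler vector-extension types (fptype_v, fptype2_v) and the x86 intrinsic types (__m128d/__m256d/__m512d, __m128/__m256/__m512), a common gcc/clang type-punning idiom; the double-to-float narrowing itself is done by the _mm_cvtpd_ps/_mm256_cvtpd_ps/_mm512_cvtpd_ps intrinsics, one per input vector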
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 01968dc817..a9bfe6c199 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.071 s +1 processes with 6 diagrams generated in 0.099 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -578,48 +579,48 @@ INFO: Finding symmetric diagrams for subprocess group gg_t1t1x DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s -Wrote files for 16 helas calls in 0.065 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Wrote files for 16 helas calls in 0.082 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.125 s +ALOHA: aloha creates 3 routines in 0.168 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.118 s +ALOHA: aloha creates 6 routines in 0.166 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: 
Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.714s -user 0m2.329s -sys 0m0.381s +real 0m2.927s +user 0m2.603s +sys 0m0.321s Code generation completed in 3 seconds ************************************************************ * * @@ -641,9 +642,10 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,9 +672,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. 
Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& 
jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?)
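+ // Copy the neppV fptype (double) elements of v1 into the low half of out and those of v2 into the high half; each element assignment implicitly narrows fptype (double) to fptype2 (float)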
+ fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } 
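+ // Note: the unions above bitcast between the compiler vector-extension types (fptype_v, fptype2_v) and the x86 intrinsic types (__m128d/__m256d/__m512d, __m128/__m256/__m512), a common gcc/clang type-punning idiom; the double-to-float narrowing itself is done by the _mm_cvtpd_ps/_mm256_cvtpd_ps/_mm512_cvtpd_ps intrinsics, one per input vector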
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0c5c2efcaf..5a13b71204 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,47 +550,47 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.074 s +1 processes with 6 diagrams generated in 0.099 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.126 s +ALOHA: aloha creates 3 routines in 0.171 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.007s -user 0m0.940s -sys 0m0.062s +real 0m1.243s +user 0m1.181s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
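The comment block above describes the symmetric color-matrix optimization only in words. The following minimal standalone sketch shows the same reduction on plain scalars, comparing the naive double loop with the upper-triangle loop that the kernel uses; the 2x2 matrix, the common denominator and all variable names are illustrative assumptions, not values taken from this patch.

#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const double denom = 3.;                                          // common color denominator (illustrative)
  const double cf[ncolor][ncolor] = { { 16., -4. }, { -4., 16. } }; // symmetric color matrix (illustrative)
  const double jampR[ncolor] = { 0.4, 0.2 }, jampI[ncolor] = { 0.1, -0.3 };
  // Naive reduction: loop over the full ncolor x ncolor matrix
  double me1 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me1 += cf[i][j] / denom * ( jampR[i] * jampR[j] + jampI[i] * jampI[j] );
  // Upper-triangle reduction: fold the factor 2 and the 1/denom weight into cf2 once
  // (the real kernel does this folding at compile time via constexpr)
  double cf2[ncolor][ncolor];
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      cf2[i][j] = ( i == j ? 1. : 2. ) * cf[i][j] / denom;
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf2[icol][icol] * jampR[icol]; // diagonal term
    double ztempI = cf2[icol][icol] * jampI[icol];
    for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // off-diagonal terms, jcol > icol only
    {
      ztempR += cf2[icol][jcol] * jampR[jcol];
      ztempI += cf2[icol][jcol] * jampI[jcol];
    }
    me2 += jampR[icol] * ztempR + jampI[icol] * ztempI;
  }
  printf( "full=%f triangle=%f\n", me1, me2 ); // identical up to rounding
  return 0;
}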
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
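The comments removed from mgOnGpuVectors.h above mention __builtin_shufflevector and the double-to-float cast as a possible alternative to intrinsics. A minimal standalone sketch of that route with gcc/clang vector extensions follows, assuming gcc 12 or later (or clang) for __builtin_shufflevector; the dvec4/fvec4/fvec8 typedefs and the merge function are hypothetical names for illustration, not code from the plugin.

#include <cstdio>
typedef double dvec4 __attribute__( ( vector_size( 32 ) ) ); // 4 doubles, like fptype_v in the "avx2" build
typedef float fvec4 __attribute__( ( vector_size( 16 ) ) );  // 4 floats
typedef float fvec8 __attribute__( ( vector_size( 32 ) ) );  // 8 floats, like fptype2_v in the "avx2" build
fvec8 merge( const dvec4& v1, const dvec4& v2 )
{
  // Narrow each double vector to a float vector of the same length...
  fvec4 f1 = __builtin_convertvector( v1, fvec4 );
  fvec4 f2 = __builtin_convertvector( v2, fvec4 );
  // ...then concatenate them (shufflevector requires the same element type on both inputs)
  return __builtin_shufflevector( f1, f2, 0, 1, 2, 3, 4, 5, 6, 7 );
}
int main()
{
  dvec4 a = { 1., 2., 3., 4. }, b = { 5., 6., 7., 8. };
  fvec8 m = merge( a, b );
  for( int i = 0; i < 8; i++ ) printf( "%g ", (double)m[i] ); // prints 1 2 3 4 5 6 7 8
  printf( "\n" );
  return 0;
}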
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
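A minimal standalone check of the SSE sequence used in the MGONGPU_CPPSIMD == 2 branch above; this is an illustrative test program (assuming an x86-64 compiler), not part of the patch.

#include <immintrin.h>
#include <cstdio>
int main()
{
  // Merge two pairs of doubles into one vector of four floats, as fpvmerge_intrinsics does for "sse4"
  __m128d d1 = _mm_set_pd( 2., 1. ); // holds { 1., 2. }
  __m128d d2 = _mm_set_pd( 4., 3. ); // holds { 3., 4. }
  __m128 f1 = _mm_cvtpd_ps( d1 );    // { 1.f, 2.f, 0.f, 0.f }
  __m128 f2 = _mm_cvtpd_ps( d2 );    // { 3.f, 4.f, 0.f, 0.f }
  __m128 merged = _mm_movelh_ps( f1, f2 ); // { 1.f, 2.f, 3.f, 4.f }
  float out[4];
  _mm_storeu_ps( out, merged );
  printf( "%g %g %g %g\n", out[0], out[1], out[2], out[3] ); // prints 1 2 3 4
  return 0;
}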
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 463187a10a..fddf41b83f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.089 s +1 processes with 3 diagrams generated in 0.093 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -578,45 +579,45 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.127 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.120 s +ALOHA: aloha creates 4 routines in 0.123 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m3.218s -user 0m2.778s -sys 0m0.430s +real 0m2.828s +user 0m2.467s +sys 0m0.319s Code generation completed in 3 seconds ************************************************************ * * @@ -638,9 +639,10 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -667,9 +669,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const 
fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
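The fpvsplit0/fpvsplit1 dispatchers further down reference fpvsplit0_intrinsics and fpvsplit1_intrinsics only as commented-out placeholders. A possible intrinsics-based split for the MGONGPU_CPPSIMD == 4 case could look like the following minimal standalone sketch (requires AVX; hypothetical illustration, not code from this patch).

#include <immintrin.h>
#include <cstdio>
int main()
{
  // Split eight floats (one fptype2_v in the "avx2" build) back into two vectors of four doubles
  __m256 f = _mm256_set_ps( 8.f, 7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f ); // holds { 1.f, ..., 8.f }
  __m256d lo = _mm256_cvtps_pd( _mm256_castps256_ps128( f ) );   // lower four floats -> doubles (like fpvsplit0)
  __m256d hi = _mm256_cvtps_pd( _mm256_extractf128_ps( f, 1 ) ); // upper four floats -> doubles (like fpvsplit1)
  double out[4];
  _mm256_storeu_pd( out, lo );
  printf( "lo: %g %g %g %g\n", out[0], out[1], out[2], out[3] ); // prints 1 2 3 4
  _mm256_storeu_pd( out, hi );
  printf( "hi: %g %g %g %g\n", out[0], out[1], out[2], out[3] ); // prints 5 6 7 8
  return 0;
}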
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 9c4080b86d..460faec9c3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -46,17 +46,15 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.6192381381988525  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -552,45 +550,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.063 s +1 processes with 3 diagrams generated in 0.094 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.127 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.922s -user 0m1.810s -sys 0m0.099s +real 0m1.237s +user 0m1.131s +sys 0m0.059s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: 
buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
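+//
+// This header collects the fpvmerge, fpvsplit0 and fpvsplit1 helpers that convert between
+// two double SIMD vectors (fptype_v) and one float SIMD vector of twice the width (fptype2_v),
+// as used by color_sum.cc in the mixed-precision (double/float) SIMD mode.
+// The MGONGPU_FPVFUN_* macros below select one of the scalar, initializer-list,
+// x86-64 intrinsics or std::experimental::simd implementations.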
+
+#ifndef MGONGPUVECTORSSPLITMERGE_H
+#define MGONGPUVECTORSSPLITMERGE_H 1
+
+#include "mgOnGpuVectors.h"
+
+// Disable all implementations
+#undef MGONGPU_FPVFUN_EXPSIMD
+#undef MGONGPU_FPVFUN_INTRINSICS
+#undef MGONGPU_FPVFUN_SCALAR
+#undef MGONGPU_FPVFUN_INITLIST
+
+// Non-default implementation of fpvmerge using experimental simd (tested with gcc11)
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS
+
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64)
+#ifdef __x86_64__
+//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS
+#endif
+
+// Non-default scalar implementation of fpvmerge for tests
+//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS
+
+// Default implementation of fpvmerge using initializer lists
+#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT
+
+// SANITY CHECKS
+#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST )
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST
+#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST
+#endif
+
+// Headers for intrinsics
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+#include <immintrin.h>
+#endif
+
+// Headers for experimental simd
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+#include <experimental/simd>
+#endif
+
+//==========================================================================
+
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+namespace mg5amcCpu
+{
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // Scalar implementation for sanity checks (slower? auto-vectorized?)
+    fptype2_v out;
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v1[ieppV];
+      out[ieppV + neppV] = v2[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's original implementation with initializer lists (Oct 2022)
+    // I initially thought that this was inefficient as it seemed as slow as double (#537)
+    // Later tests show that this is as fast as intrinsics and faster than experimental SIMD
+#if MGONGPU_CPPSIMD == 2
+    // --- CUDACPP "sse4" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
+#elif MGONGPU_CPPSIMD == 4
+    // --- CUDACPP "avx2" or "512y" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
+#elif MGONGPU_CPPSIMD == 8
+    // --- CUDACPP "512z" ---
+    fptype2_v out =
+      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_INTRINSICS
+  inline fptype2_v
+  fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with x86-64 intrinsics (Nov 2025)
+#if MGONGPU_CPPSIMD == 2 /* clang-format off */
+    // --- CUDACPP "sse4" ---
+    union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d
+    union{ __m128 i; fptype2_v v; } u12;    // bitcast __m128 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f10 = _mm_cvtpd_ps( u1.i );      // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f20 = _mm_cvtpd_ps( u2.i );      // converts 2 doubles to 2 floats into lower 64 bits of __m128
+    __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 4 /* clang-format off */
+    // --- CUDACPP "avx2" or "512y" ---
+    union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d
+    union { __m256 i; fptype2_v v; } u12;    // bitcast __m256 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m128 f1 = _mm256_cvtpd_ps( u1.i );             // converts 4 doubles to 4 floats into __m128
+    __m128 f2 = _mm256_cvtpd_ps( u2.i );             // converts 4 doubles to 4 floats into __m128
+    __m256 f10 = _mm256_castps128_ps256( f1 );       // insert f1 into lower 128 bits of f10
+    __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#elif MGONGPU_CPPSIMD == 8 /* clang-format off */
+    // --- CUDACPP "512z" ---
+    union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d
+    union { __m512 i; fptype2_v v; } u12;    // bitcast __m512 to fptype2_v /* clang-format on */
+    u1.v = v1;
+    u2.v = v2;
+    __m256 f1 = _mm512_cvtpd_ps( u1.i );           // converts 8 doubles to 8 floats into __m256
+    __m256 f2 = _mm512_cvtpd_ps( u2.i );           // converts 8 doubles to 8 floats into __m256
+    __m512 f10 = _mm512_castps256_ps512( f1 );     // insert f1 into lower 256 bits of f10
+    __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits
+    u12.i = f12;
+    fptype2_v out = u12.v;
+#endif
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPU_FPVFUN_EXPSIMD
+  inline fptype2_v
+  fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 )
+  {
+    // AV's implementation with experimental simd (Nov 2025)
+    namespace stdx = std::experimental;
+    // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d>
+    constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD
+    stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned );
+    stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned );
+    // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d>
+    // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts)
+    stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 );
+    stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 );
+    // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d>
+    // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead
+    fptype2_v out;
+    sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned );
+    sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned );
+    return out;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  inline fptype2_v
+  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvmerge_scalar( v1, v2 );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    return fpvmerge_expsimd( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    return fpvmerge_intrinsics( v1, v2 );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvmerge_initializerlist( v1, v2 );
+#else
+#error No implementation found for fpvmerge
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0_initializerlist( const fptype2_v& v )
+  {
+#if MGONGPU_CPPSIMD == 2
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1] };
+#elif MGONGPU_CPPSIMD == 4
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
+#elif MGONGPU_CPPSIMD == 8
+    fptype_v out =
+      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
+#endif
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit0( const fptype2_v& v )
+  {
+#ifdef MGONGPU_FPVFUN_SCALAR
+    return fpvsplit0_scalar( v );
+#elif defined MGONGPU_FPVFUN_EXPSIMD
+    //return fpvsplit0_expsimd( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INTRINSICS
+    //return fpvsplit0_intrinsics( v );
+    return fpvsplit0_initializerlist( v );
+#elif defined MGONGPU_FPVFUN_INITLIST
+    return fpvsplit0_initializerlist( v );
+#else
+#error No implementation found for fpvsplit0
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline fptype_v
+  fpvsplit1_scalar( const fptype2_v& v )
+  {
+    fptype_v out = {};
+    for( int ieppV = 0; ieppV < neppV; ieppV++ )
+    {
+      out[ieppV] = v[ieppV + neppV];
+    }
+    return out;
+  }
+
+  //--------------------------------------------------------------------------
+
+  inline
fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 9875c9cf7a..edc9344409 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,12 +1,28 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: 
warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:31 +DATE: 2025-12-07_19:56:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7544s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7292s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7216s + [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] 
VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2081s + [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2103s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149454e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.154870e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.182730e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138965e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2160s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.82E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 8192 events => throughput is 1.88E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914270e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875926e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.995666e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.978144e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533255e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573751e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.641624e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689952e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2163s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2114s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.54E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.604635e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.725193e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.785858e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2062s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.96E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.065060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.064898e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.156200e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.133687e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6520s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s + [COUNTERS] PROGRAM TOTAL : 0.6427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6387s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.24E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.427727e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379742e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.442402e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.244351e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123576e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695733e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.069823e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971382e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.084747e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.641913e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.494944e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.396984e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.063740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.415941e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.407117e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fbf3c34fcc..be7b96d8c0 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for 
target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:56 +DATE: 2025-12-07_19:56:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7580s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7502s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7226s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 
0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2087s + [COUNTERS] Fortran MEs ( 1 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2214s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.197154e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187029e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.200720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204475e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2161s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2060s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.06E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.577999e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.111052e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.193882e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2126s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2099s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468253e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.462565e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468239e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570036e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.276853e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456259e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.494548e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528481e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2182s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2105s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.01E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.354967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.297641e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.469737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.574917e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6719s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.25E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s + [COUNTERS] PROGRAM TOTAL : 0.6489s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.28E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.421145e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.134710e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.263812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.989442e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.466407e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.704746e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.768150e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.522252e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.574848e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.795522e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.510215e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.183497e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.891814e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.251390e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.714240e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681388e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 07ac440ea1..0c0bbfe6f2 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:44 +DATE: 2025-12-07_19:56:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7437s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7358s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 
[9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2128s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2144s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2069s + [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2248s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2133s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144515e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.141490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146572e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2129s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.92E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.989196e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.006790e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.027429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.074281e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2093s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2058s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.540266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.588652e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.722635e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.694742e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2101s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.44E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.634053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643127e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.703762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.775081e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2186s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.11E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.160546e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.200792e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.303805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.334925e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789453073233E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6515s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6432s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6392s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.593291e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172927e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.163347e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.285125e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.056075e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652551e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.054571e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.985084e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.089599e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.612272e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.480305e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.344657e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.035852e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.686897e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419141e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.410595e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 9182ca8a9b..8cd13a51b8 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,7 +1,23 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for 
target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:09 +DATE: 2025-12-07_19:56:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8241s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7829s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 
events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4428s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4013s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.822539e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.859152e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.841641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852910e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4390s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4261s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 8192 events => throughput is 3.31E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.296764e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.252405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269668e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 4.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4191s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.116784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.242283e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.216981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.292484e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4188s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4035s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0150s for 8192 events => throughput is 5.47E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.373735e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.438042e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.485622e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4223s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3997s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0223s for 8192 events => throughput is 3.68E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.514185e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.683579e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.539500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.767547e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8570s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8463s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8416s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.853419e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.843252e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.409968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.504663e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.832304e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642084e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.660331e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.652174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.861253e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.646569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.014024e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.988601e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.853068e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615581e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.417253e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413334e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7fd8a9128c..90a6b10aea 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean'
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:38 +DATE: 2025-12-07_19:57:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8468s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8038s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8263s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7852s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 
1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4030s + [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4050s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955637e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.925228e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.998110e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4334s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.677131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.664552e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.687091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.664587e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4105s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 8192 events => throughput is 9.32E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.918801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.061883e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.134969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725915e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4245s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 9.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4098s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.54E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.308113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.387245e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.304031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.428088e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4163s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3998s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.93E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.713633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.805041e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.787911e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.784129e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138612400084860] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8642s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.13E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430485e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.634270e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.851471e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.759880e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.303444e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.744455e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736108e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.777428e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.294616e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990089e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.983032e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.374093e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.010691e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.364214e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.355175e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index e56bc4eee0..a9ae7d3d4f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o'
+makefile_original.mk:78:
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:23 +DATE: 2025-12-07_19:57:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8528s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8099s - [COUNTERS] Fortran MEs ( 1 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8241s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7830s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 
[XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4425s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4012s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613340029622] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4450s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3998s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0449s for 8192 events => throughput is 1.83E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613340029622) differ by less than 2E-4 (2.9105554633090946e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.819635e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.862398e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.820245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858551e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613314674643] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4109s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4254s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0243s for 8192 events => throughput is 3.38E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613314674643) differ by less than 2E-4 (2.856767333803134e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.279259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331632e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.279521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415671e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613321455189] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0150s for 8192 events => throughput is 5.46E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613321455189) differ by less than 2E-4 (2.8711516053547825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.322301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.293597e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.904240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.456419e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613321455189] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.44E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4159s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4012s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.71E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613321455189) differ by less than 2E-4 (2.8711516053547825e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.558424e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.653855e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.634376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.709920e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613321455189] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.57E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613321455189) differ by less than 2E-4 (2.8711516053547825e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654630e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598726e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.679375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652168e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613294297848] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s + [COUNTERS] PROGRAM TOTAL : 0.8531s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8485s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912312e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807559e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.471933e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.196485e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.863402e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643269e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634047e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.622586e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.849540e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.611852e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.953899e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966973e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847641e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.630989e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.416006e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413154e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d8d6f34ca2..3dcb91e90a 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:52 +DATE: 2025-12-07_19:57:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7558s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s - [COUNTERS] Fortran MEs ( 1 ) : 0.3400s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7363s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4070s + [COUNTERS] Fortran MEs ( 1 ) : 0.3293s for 8192 events => throughput is 2.49E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] 
Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3869s - [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3829s + [COUNTERS] Fortran MEs ( 1 ) : 0.3347s for 8192 events => throughput is 2.45E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7509s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3585s for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3462s for 8192 events => throughput is 2.37E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.384792e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.432819e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.379994e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.441078e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5787s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 8192 events => throughput is 4.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1874s for 8192 events => throughput is 4.37E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.477039e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.636362e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.489628e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.647545e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4876s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0942s for 8192 events => throughput is 8.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4721s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3812s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0903s for 8192 events => throughput is 9.07E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.903439e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.197668e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.886830e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.288963e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 8192 events => throughput is 9.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0833s for 8192 events => throughput is 9.83E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.779459e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024866e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.857066e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006961e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5118s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1188s for 8192 events => throughput is 6.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1117s for 8192 events => throughput is 7.34E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.951589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.155221e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994069e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.067357e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8333s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.17E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s + [COUNTERS] PROGRAM TOTAL : 0.8208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.26E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0054s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.930684e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.743533e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.049354e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054065e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010359e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.883559e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220373e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218074e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008910e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.825959e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.368579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367329e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010569e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882385e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.799070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797077e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 405a8e9845..391aff4924 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,13 +1,29 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:26 +DATE: 2025-12-07_19:58:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] Fortran MEs ( 1 ) : 0.3398s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7306s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4000s + [COUNTERS] Fortran MEs ( 1 ) : 0.3306s for 8192 events => throughput is 2.48E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 
[UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3864s - [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3816s + [COUNTERS] Fortran MEs ( 1 ) : 0.3290s for 8192 events => throughput is 2.49E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471473429998356E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3369s for 8192 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7384s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3412s for 8192 events => throughput is 2.40E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.5745885295626039e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473429998356E-002) differ by less than 4E-4 (1.5776112904930528e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.486290e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.552337e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.478806e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584414e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471459219682932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4955s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4839s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3838s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0997s for 8192 events => throughput is 8.21E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.354712e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.004232e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.187687e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4337s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.733359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.778417e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.722443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791323e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0452s for 8192 events => throughput is 1.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4232s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3802s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.850143e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959783e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891286e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959640e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471471746130506E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3929s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0592s for 8192 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0548s for 8192 events => throughput is 1.50E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.406796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.505957e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.412048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.502716e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471471641207505E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8323s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.8220s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8162s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.67E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.479157e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428778e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.067147e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096519e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.047251e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.860004e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847769e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.051348e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.975516e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.997681e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.003310e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.964172e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912379e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.785109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.787184e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b21554372e..de210e230f 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,7 +1,23 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:09 +DATE: 2025-12-07_19:57:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s - [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7411s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4033s + [COUNTERS] Fortran MEs ( 1 ) : 0.3378s for 8192 events => throughput is 2.43E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 
[UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s - [COUNTERS] Fortran MEs ( 1 ) : 0.3393s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3789s + [COUNTERS] Fortran MEs ( 1 ) : 0.3285s for 8192 events => throughput is 2.49E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486563309989E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3580s for 8192 events => throughput is 2.29E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.7286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3786s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3490s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765988561561e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486563309989E-002) differ by less than 2E-4 (9.602996842161815e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.359867e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413041e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.360283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.401008e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486604491186E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3921s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1821s for 8192 events => throughput is 4.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1770s for 8192 events => throughput is 4.63E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486604491186E-002) differ by less than 2E-4 (1.0127788829805695e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.570903e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.760488e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.571774e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.682658e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486496532281E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4882s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0922s for 8192 events => throughput is 8.88E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0882s for 8192 events => throughput is 9.29E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486496532281E-002) differ by less than 2E-4 (8.752016444901756e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.192817e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.564159e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.186620e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.467812e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486496532281E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4787s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3937s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0844s for 8192 events => throughput is 9.71E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3801s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 8192 events => throughput is 9.99E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486496532281E-002) differ by less than 2E-4 (8.752016444901756e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030482e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.000380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036455e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486496532281E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 8192 events => throughput is 6.95E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.4952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3816s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 8192 events => throughput is 7.25E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486496532281E-002) differ by less than 2E-4 (8.752016444901756e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.931283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.402217e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.899982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.261134e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486543087457E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8352s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.93E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s + [COUNTERS] PROGRAM TOTAL : 0.8235s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8168s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 6.06E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.941062e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.876181e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.043050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045049e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.003879e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882203e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.219422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219448e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.007497e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882013e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.367555e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.368444e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012869e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.876205e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.798121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.798438e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fcf14d36a5..35391cd0c7 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -17,20 +33,110 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:42 +DATE: 2025-12-07_19:58:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8675s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3041s - [COUNTERS] Fortran MEs ( 1 ) : 4.5634s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5649s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2911s + [COUNTERS] Fortran MEs ( 1 ) : 4.2737s for 8192 events => throughput is 1.92E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] 
fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8255s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2969s - [COUNTERS] Fortran MEs ( 1 ) : 4.5287s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s + [COUNTERS] Fortran MEs ( 1 ) : 4.2627s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8499s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2944s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5463s for 8192 events => throughput is 1.80E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0092s + [COUNTERS] PROGRAM TOTAL : 4.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4168s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0090s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855071e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923013e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.864869e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.903940e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.8407s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5401s for 8192 events => throughput is 3.23E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s + [COUNTERS] PROGRAM TOTAL : 2.6598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3644s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.391185e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.525548e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.371248e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.521351e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0657s for 8192 events => throughput is 7.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0203s for 8192 events => throughput is 8.03E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.818945e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.237261e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.888581e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.255488e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9400s for 8192 events => throughput is 8.71E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.1968s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9045s for 8192 events => throughput is 9.06E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0021s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.864841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282689e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.851817e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.303857e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2959s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + [COUNTERS] PROGRAM TOTAL : 1.4449s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2915s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1508s for 8192 events => throughput is 7.12E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.755860e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217087e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.706109e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.186312e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7754s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7315s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.26E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0246s + [COUNTERS] PROGRAM TOTAL : 0.7727s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7292s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0243s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.416533e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.411560e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.462010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.446535e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.359331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.354461e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.449399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.490045e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.367790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.348917e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.485849e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.352021e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.480569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.484935e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 5c635cc8ef..43aa10ff33 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,13 +1,29 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:12:25 +DATE: 2025-12-07_20:00:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s - [COUNTERS] Fortran MEs ( 1 ) : 4.5716s for 8192 events => throughput is 1.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] Fortran MEs ( 1 ) : 4.2836s for 8192 events => throughput is 1.91E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = 
TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s - [COUNTERS] Fortran MEs ( 1 ) : 4.5284s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6332s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s + [COUNTERS] Fortran MEs ( 1 ) : 4.3443s for 8192 events => throughput is 1.89E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941326459554] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941317777332] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4378s for 8192 events => throughput is 1.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s + [COUNTERS] PROGRAM TOTAL : 4.5693s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2939s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2670s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0084s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941326459554) differ by less than 4E-4 (4.669368411036601e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941317777332) differ by less than 4E-4 (4.66910646257368e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.908171e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981460e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.916943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.982886e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.68E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.4700s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1756s for 8192 events => throughput is 6.97E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.792707e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.131320e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.847129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.104961e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5137s for 8192 events => throughput is 1.59E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635075e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.556326e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641448e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4823s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s + [COUNTERS] PROGRAM TOTAL : 0.7484s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4593s for 8192 events => throughput is 1.78E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.756110e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.848597e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.758530e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.847806e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.9014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6052s for 8192 events => throughput is 1.35E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.8662s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5757s for 8192 events => throughput is 1.42E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375609e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.437740e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.357712e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447840e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144804761684321] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7725s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0227s + [COUNTERS] PROGRAM TOTAL : 0.7631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.57E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0223s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.844164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.908697e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.016020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.019232e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.967323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.962780e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.138637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.129305e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.960156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.967179e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.136855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.133527e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.944572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951596e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.273692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.268509e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 2f61c77e8d..8c1cf0d30c 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:11:34 +DATE: 2025-12-07_19:59:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s - [COUNTERS] Fortran MEs ( 1 ) : 4.5492s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5703s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s + [COUNTERS] Fortran MEs ( 1 ) : 4.2809s for 8192 events => throughput is 1.91E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] 
fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8278s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] Fortran MEs ( 1 ) : 4.5289s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2905s + [COUNTERS] Fortran MEs ( 1 ) : 4.3047s for 8192 events => throughput is 1.90E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786658869840] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.9193s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6155s for 8192 events => throughput is 1.77E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s + [COUNTERS] PROGRAM TOTAL : 4.6567s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3589s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ by less than 2E-4 (5.228634192278037e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786658869840) differ by less than 2E-4 (2.945550470201397e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.840344e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935931e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.842142e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917711e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786581373942] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.7307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4288s for 8192 events => throughput is 3.37E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 2.6265s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2914s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3304s for 8192 events => throughput is 3.52E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ by less than 2E-4 (2.7278828085286477e-09) +OK! 
xsec from fortran (0.33144786561240197) and cpp (0.33144786581373942) differ by less than 2E-4 (6.074483138718278e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.428088e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600668e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.464566e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607225e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786506015422] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3474s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0479s for 8192 events => throughput is 7.82E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.3271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2937s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0311s for 8192 events => throughput is 7.95E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0023s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786506015422) differ by less than 2E-4 (1.666167759317716e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.942226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076326e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.692396e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.371327e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786506015422] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2106s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9138s for 8192 events => throughput is 8.96E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.1805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2915s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8869s for 8192 events => throughput is 9.24E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786506015422) differ by less than 2E-4 (1.666167759317716e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.272414e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.496994e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.142833e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.475880e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786506015422] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5269s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3007s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2234s for 8192 events => throughput is 6.70E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.4303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2906s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1368s for 8192 events => throughput is 7.21E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786506015422) differ by less than 2E-4 (1.666167759317716e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.830218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.270985e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.809509e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.139357e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786716305458] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7376s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0240s + [COUNTERS] PROGRAM TOTAL : 0.7765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.28E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0238s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.386902e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.484069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.463660e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.409887e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.409349e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.456801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.457426e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.362526e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.389250e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.463078e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.498527e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.357037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.353780e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.491061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.502387e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index fe6b10b3d3..738aa83744 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:52 +DATE: 2025-12-07_20:01:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5363s - [COUNTERS] Fortran MEs ( 1 ) : 101.7141s for 8192 events => throughput is 8.05E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.0374s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5213s + [COUNTERS] Fortran MEs ( 1 ) : 97.5161s for 8192 events => throughput is 8.40E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 
[2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s - [COUNTERS] Fortran MEs ( 1 ) : 101.6749s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.1907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5190s + [COUNTERS] Fortran MEs ( 1 ) : 97.6716s for 8192 events => throughput is 8.39E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 128.7427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5353s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.9956s for 8192 events => throughput is 6.40E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2118s + [COUNTERS] PROGRAM TOTAL : 124.7248s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5222s + [COUNTERS] CudaCpp MEs ( 2 ) : 123.9891s for 8192 events => throughput is 6.61E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2136s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.580483e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.820861e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.620995e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.876110e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 69.6189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 68.9781s for 8192 events => throughput is 1.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1125s + [COUNTERS] PROGRAM TOTAL : 66.8557s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5230s + [COUNTERS] CudaCpp MEs ( 2 ) : 66.2249s for 8192 events => throughput is 1.24E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1078s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.424482e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.490973e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419676e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.489426e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 30.3572s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5354s - [COUNTERS] CudaCpp MEs ( 2 ) : 29.7726s for 8192 events => throughput is 2.75E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s + [COUNTERS] PROGRAM TOTAL : 29.2810s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5221s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.7118s for 8192 events => throughput is 2.85E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0471s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.296671e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415114e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.296231e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.425161e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.8666s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2902s for 8192 events => throughput is 3.12E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0424s + [COUNTERS] PROGRAM TOTAL : 25.7124s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5193s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.1522s for 8192 events => throughput is 3.26E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0409s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.796432e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.921551e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.783837e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.889857e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 27.2211s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5330s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.6390s for 8192 events => throughput is 3.08E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0491s + [COUNTERS] PROGRAM TOTAL : 27.2370s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.6555s for 8192 events => throughput is 3.07E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0484s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.322007e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534214e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.342992e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520956e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.0387s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6155s for 8192 events => throughput is 1.33E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3464s + [COUNTERS] PROGRAM TOTAL : 1.9503s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9967s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6199s for 8192 events => throughput is 1.32E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3337s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336265e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.340459e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.298842e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.309952e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.363941e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331930e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.311264e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.309277e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.338602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331907e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323398e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.320685e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336359e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331413e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.336023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.366790e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index da0706ada3..421a8ca30e 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 BACKEND=cpp512y make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:46:23 +DATE: 2025-12-07_20:32:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.9219s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5367s - [COUNTERS] Fortran MEs ( 1 ) : 102.3853s for 8192 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.2521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5250s + [COUNTERS] Fortran MEs ( 1 ) : 97.7271s for 8192 events => throughput is 8.38E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 
[2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.9948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s - [COUNTERS] Fortran MEs ( 1 ) : 102.4541s for 8192 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5268s + [COUNTERS] Fortran MEs ( 1 ) : 98.2003s for 8192 events => throughput is 8.34E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849511111252E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849519183833E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 116.5594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5371s - [COUNTERS] CudaCpp MEs ( 2 ) : 115.8332s for 8192 events => throughput is 7.07E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1891s + [COUNTERS] PROGRAM TOTAL : 110.3155s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5181s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.6179s for 8192 events => throughput is 7.47E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1795s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849511111252E-007) differ by less than 4E-4 (0.00013948250052009392) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849519183833E-007) differ by less than 4E-4 (0.00013948284297660152) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.535383e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.855646e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.441970e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.863475e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 31.5456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5700s - [COUNTERS] CudaCpp MEs ( 2 ) : 30.9224s for 8192 events => throughput is 2.65E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0531s + [COUNTERS] PROGRAM TOTAL : 30.2520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5172s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.6856s for 8192 events => throughput is 2.76E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.071038e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.217556e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043650e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219508e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 15.3844s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5370s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.8227s for 8192 events => throughput is 5.53E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0247s + [COUNTERS] PROGRAM TOTAL : 14.8408s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5177s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.2997s for 8192 events => throughput is 5.73E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0234s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.685687e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.946765e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.672269e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.909895e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.1447s for 8192 events => throughput is 6.23E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0214s + [COUNTERS] PROGRAM TOTAL : 13.0888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5168s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.5512s for 8192 events => throughput is 6.53E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0208s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.552784e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.856877e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.581015e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.825347e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.9360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5476s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.3630s for 8192 events => throughput is 6.13E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0254s + [COUNTERS] PROGRAM TOTAL : 13.1125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5164s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.5735s for 8192 events => throughput is 6.52E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0226s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.686443e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.056508e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.667526e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.091741e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572568120113116E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 1.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0122s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2322s for 8192 events => throughput is 3.53E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2811s + [COUNTERS] PROGRAM TOTAL : 1.5032s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9885s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2327s for 8192 events => throughput is 3.52E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2820s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547134e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548838e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.607921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.595515e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.571279e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582993e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601694e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601957e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.579531e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579638e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.607459e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.616260e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.584591e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577593e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.996351e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000796e+04 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 972fcc6999..f1e0d12959 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:30:19 +DATE: 2025-12-07_20:17:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.1691s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5343s - [COUNTERS] Fortran MEs ( 1 ) : 101.6348s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.3478s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5227s + [COUNTERS] Fortran MEs ( 1 ) : 97.8252s for 8192 events => throughput is 8.37E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 
[2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s - [COUNTERS] Fortran MEs ( 1 ) : 101.6729s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.3161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5225s + [COUNTERS] Fortran MEs ( 1 ) : 97.7936s for 8192 events => throughput is 8.38E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756513648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 130.3996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5377s - [COUNTERS] CudaCpp MEs ( 2 ) : 129.6472s for 8192 events => throughput is 6.32E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2147s + [COUNTERS] PROGRAM TOTAL : 121.8148s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5210s + [COUNTERS] CudaCpp MEs ( 2 ) : 121.0973s for 8192 events => throughput is 6.76E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1965s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007) differ by less than 2E-4 (5.417890580616813e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756513648E-007) differ by less than 2E-4 (8.7063609655047e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.490256e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076552e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.489525e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.089780e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561761967415E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.8540s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5288s - [COUNTERS] CudaCpp MEs ( 2 ) : 64.2213s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1039s + [COUNTERS] PROGRAM TOTAL : 62.0813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5227s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.4575s for 8192 events => throughput is 1.33E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1011s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007) differ by less than 2E-4 (6.3622664914220195e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561761967415E-007) differ by less than 2E-4 (8.937721895918571e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.563988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590729e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529721e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595439e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756223343E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.8286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2496s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s + [COUNTERS] PROGRAM TOTAL : 27.4801s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5185s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.9179s for 8192 events => throughput is 3.04E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0437s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756223343E-007) differ by less than 2E-4 (8.694045705581743e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.534195e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716360e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.569719e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695008e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756223343E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5773s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s + [COUNTERS] PROGRAM TOTAL : 24.4725s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5198s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.9148s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0380s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756223343E-007) differ by less than 2E-4 (8.694045705581743e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.054403e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223981e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.039174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.203486e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756223343E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.7057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5352s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.1230s for 8192 events => throughput is 3.14E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0475s + [COUNTERS] PROGRAM TOTAL : 24.7627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5224s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.1966s for 8192 events => throughput is 3.39E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0436s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756223343E-007) differ by less than 2E-4 (8.694045705581743e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.438352e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.682293e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.447842e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678078e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561670766515E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 1.8201s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4965s for 8192 events => throughput is 1.65E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3105s + [COUNTERS] PROGRAM TOTAL : 1.7965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9950s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5000s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3015s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674381e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.607592e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595864e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.667090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674334e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.595955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590423e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.655497e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.659790e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622539e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623409e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.675870e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655690e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.460940e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.433230e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7c2d5d02c8..d4c025ce64 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for 
target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:08 +DATE: 2025-12-07_20:00:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4745s - [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4597s + [COUNTERS] Fortran MEs ( 1 ) : 0.0710s for 8192 events => throughput is 1.15E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 
[UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4930s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s - [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s + [COUNTERS] Fortran MEs ( 1 ) : 0.0737s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4901s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4931s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 8192 events => throughput is 1.01E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055904e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062536e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.064104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059178e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4528s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3986s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922320e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.882630e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905656e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4341s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4009s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.23E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.217719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.233889e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.250909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.247620e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4367s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4018s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.50E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.377107e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.411797e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.445554e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.434584e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4029s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.314404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.452369e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.349276e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.400743e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8613s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8556s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s + [COUNTERS] PROGRAM TOTAL : 0.8442s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8387s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.01E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.598770e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.455155e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.439546e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.192502e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131033e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998840e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.214633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.121027e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.430009e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.410249e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.226812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.125103e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646817e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646103e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 2376b74b06..d5d6e86221 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:38 +DATE: 2025-12-07_20:01:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5325s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4601s - [COUNTERS] Fortran MEs ( 1 ) : 0.0724s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4455s + [COUNTERS] Fortran MEs ( 1 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] 
Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] Fortran MEs ( 1 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4008s + [COUNTERS] Fortran MEs ( 1 ) : 0.0707s for 8192 events => throughput is 1.16E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0747s for 8192 events => throughput is 1.10E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108850e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120135e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.137762e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 8192 events => throughput is 3.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4246s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.15E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.944992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069770e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.961979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.013323e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4109s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.824085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.055835e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.049332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.993518e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4225s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3999s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.355595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.328777e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.395017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.445137e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4253s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4069s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0180s for 8192 events => throughput is 4.55E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628365e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.699497e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.648318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.812458e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313508404553540] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8566s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.202405e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324486e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.296000e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690443e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.115794e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.830726e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.024681e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021591e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.134420e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.846993e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104635e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091327e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.797328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.584025e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.751422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739440e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index cf138d100f..b01495a803 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:23 +DATE: 2025-12-07_20:01:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5311s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s - [COUNTERS] Fortran MEs ( 1 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4436s + [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] 
Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4687s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3990s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504489066839] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4001s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0764s for 8192 events => throughput is 1.07E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ by less than 2E-4 (5.115954326839756e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504489066839) differ by less than 2E-4 (8.206504364949296e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.054873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073109e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.059290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.084610e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504500989210] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4535s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4401s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10) +OK! 
xsec from fortran (0.20313504505737126) and cpp (0.20313504500989210) differ by less than 2E-4 (2.337320337275628e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.896659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920647e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.942885e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504503723248] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4072s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4223s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504503723248) differ by less than 2E-4 (9.913980747455753e-11) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.285561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415834e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.331125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418817e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504503723248] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4323s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4216s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3983s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 8192 events => throughput is 3.58E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504503723248) differ by less than 2E-4 (9.913980747455753e-11) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.491118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602897e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.400822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625427e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504503723248] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4453s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4315s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3980s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504503723248) differ by less than 2E-4 (9.913980747455753e-11) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392779e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511576e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.464757e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504511630270] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8562s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8507s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.04E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 0.8414s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8360s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.05E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0046s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.558045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504582e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.456934e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.486462e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.187313e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.125780e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.035767e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.017670e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.212826e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126083e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.409792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.408544e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.225960e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135337e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646014e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.648390e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 2e04a004a3..cf7bda130d 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,11 +1,27 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: 
overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:58:37 +DATE: 2025-12-07_20:44:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0898s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0409s - [COUNTERS] Fortran MEs ( 1 ) : 0.0488s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0515s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0050s + [COUNTERS] Fortran MEs ( 1 ) : 0.0466s for 8192 events => throughput is 1.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4945s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4458s - [COUNTERS] Fortran MEs ( 1 ) : 0.0487s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4400s + [COUNTERS] Fortran MEs ( 1 ) : 0.0471s for 8192 events => throughput is 1.74E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.5064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4975s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0536s for 8192 events => throughput is 1.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654862e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.621541e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634172e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4797s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 8192 events => throughput is 2.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 8192 events => throughput is 2.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.925389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.020898e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.958081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039546e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4533s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4481s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4311s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0166s for 8192 events => throughput is 4.95E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.831423e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.899582e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.833351e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.954409e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4705s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 5.00E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.130791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.215502e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.171570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.208800e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4541s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4305s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.370093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.461993e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.372925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.516317e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.14E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.725729e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828633e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.044433e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.229155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.665417e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.525462e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.597159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.561087e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.632530e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505969e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.850879e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.878025e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary 
= CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.607978e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.507473e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.208968e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index b05e5697ad..15fef37224 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:08 +DATE: 2025-12-07_20:44:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0937s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s - [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0454s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9987s + [COUNTERS] Fortran MEs ( 1 ) : 0.0467s for 8192 events => throughput is 1.75E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4992s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s - [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4314s + [COUNTERS] Fortran MEs ( 1 ) : 0.0466s for 8192 events => throughput is 1.76E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1 [UNWEIGHT] Wrote 1653 events (found 1658 events) - [COUNTERS] PROGRAM TOTAL : 0.5029s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0491s for 8192 events => throughput is 1.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4782s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4311s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,7 +238,7 @@ diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubP < 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 < 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1. 
< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1. -< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0. +< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 9. < 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. < -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. < diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index a81624efdc..54688fd8ff 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for 
target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:58:52 +DATE: 2025-12-07_20:44:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0919s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s - [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0457s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9992s + [COUNTERS] Fortran MEs ( 1 ) : 0.0465s for 8192 events => throughput is 1.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] 
MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4479s - [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4324s + [COUNTERS] Fortran MEs ( 1 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952523923] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.5020s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4502s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0513s for 8192 events => throughput is 1.60E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4309s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0504s for 8192 events => throughput is 1.63E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952523923) differ by less than 2E-4 (2.3450735575636372e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.533252e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548996e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529423e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.574930e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081953519970] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4812s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4585s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 8192 events => throughput is 2.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081953519970) differ by less than 2E-4 (2.3500142498633636e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.789074e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872481e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.868386e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952909974] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4503s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4323s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952909974) differ by less than 2E-4 (2.3469884924409712e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.670071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.684459e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743283e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.766231e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952909974] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4554s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4462s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4295s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.02E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952909974) differ by less than 2E-4 (2.3469884924409712e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.832111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.049428e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.036692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.132068e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981445623] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952909974] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4531s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160081981445623) differ by less than 2E-4 (2.4885338012481384e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952909974) differ by less than 2E-4 (2.3469884924409712e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.244912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395591e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.260859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254350e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081952642219] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.9023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8974s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.8727s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8681s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.12E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.648200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826341e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.088314e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.317919e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.635192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.514052e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.596149e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.580497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.579204e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.478399e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.870733e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.838667e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.605252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.495597e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211048e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.210537e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index ee647bf095..29b039ab64 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - -make USEBUILDDIR=1 BACKEND=cuda +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:38 +DATE: 2025-12-07_20:46:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7275s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s - [COUNTERS] Fortran MEs ( 1 ) : 2.3569s for 8192 events => throughput is 3.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3510s + [COUNTERS] Fortran MEs ( 1 ) : 2.2672s for 8192 events => throughput is 3.61E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId 
= 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7259s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3684s - [COUNTERS] Fortran MEs ( 1 ) : 2.3575s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s + [COUNTERS] Fortran MEs ( 1 ) : 2.2998s for 8192 events => throughput is 3.56E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4402s for 8192 events => throughput is 3.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3520s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3695s for 8192 events => throughput is 3.46E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0051s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.441343e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.578271e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.445366e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580927e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.7137s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3396s for 8192 events => throughput is 6.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3535s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2698s for 8192 events => throughput is 6.45E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.351156e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.645556e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.406951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.658012e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9625s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5902s for 8192 events => throughput is 1.39E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s + [COUNTERS] PROGRAM TOTAL : 0.9194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3539s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5640s for 8192 events => throughput is 1.45E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.435538e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496117e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436593e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.477496e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9044s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5338s for 8192 events => throughput is 1.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8666s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5130s for 8192 events => throughput is 1.60E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.541883e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657616e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.588675e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657839e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0751s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7040s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s + [COUNTERS] PROGRAM TOTAL : 1.0172s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3524s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6631s for 8192 events => throughput is 1.24E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.193272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264435e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.191231e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265815e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8448s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] PROGRAM TOTAL : 0.8251s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7945s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0187s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0181s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.695448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.743287e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.925847e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.914289e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.997799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.978614e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.170285e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.164121e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.983419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.975272e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.128334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.125574e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.982511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.939204e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.328429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324436e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 1cc58a2dd1..704e45ff0a 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,13 +1,29 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +makefile_original.mk:62: warning: 
ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:02:03 +DATE: 2025-12-07_20:47:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7018s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s - [COUNTERS] Fortran MEs ( 1 ) : 2.3393s for 8192 events => throughput is 3.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6030s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3498s + [COUNTERS] Fortran MEs ( 1 ) : 2.2533s for 8192 events => throughput is 3.64E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] 
ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3681s - [COUNTERS] Fortran MEs ( 1 ) : 2.3460s for 8192 events => throughput is 3.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6054s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s + [COUNTERS] Fortran MEs ( 1 ) : 2.2533s for 8192 events => throughput is 3.64E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381686359952968E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381686370315886E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7333s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3691s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3595s for 8192 events => throughput is 3.47E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.6715s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3570s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3100s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0046s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686359952968E-007) differ by less than 4E-4 (9.949675585652074e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686370315886E-007) differ by less than 4E-4 (9.951032315935748e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581994e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.730379e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.595398e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.719030e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7079s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 1.0165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6615s for 8192 events => throughput is 1.24E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.209114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249842e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.211724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.256574e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6741s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3011s for 8192 events => throughput is 2.72E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s + [COUNTERS] PROGRAM TOTAL : 0.6413s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2881s for 8192 events => throughput is 2.84E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.778595e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.895411e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785996e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856694e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6455s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2742s for 8192 events => throughput is 2.99E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.6273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3575s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2689s for 8192 events => throughput is 3.05E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.038472e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.190545e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.060001e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.203070e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3514s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3504s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3291s for 8192 events => throughput is 2.49E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.367267e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505321e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.356404e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.508454e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381615491789429E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8351s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8093s + [COUNTERS] PROGRAM TOTAL : 0.8157s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7900s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0182s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0180s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147094e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.179241e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177156e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221684e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.249728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249997e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220968e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248034e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218387e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.651149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.638557e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 2ca786964c..7a069cfd45 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 + +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
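[Editor's note, not part of the committed logs] The "OK! xsec from fortran (...) and cpp (...) differ by less than 4E-4 (...)" lines above compare the Fortran and CudaCpp cross sections; the figure in parentheses is consistent with a plain relative difference taken with respect to the Fortran value. A minimal Python sketch under that assumption (illustrative only, not the actual tmad comparison script; variable names are made up), using the two cross sections quoted in the FLT log above:

# Minimal sketch: reproduce the relative difference quoted in a "differ by less than 4E-4" line.
xsec_fortran = 7.6381610362728578e-07  # Fortran cross section from the log above
xsec_cpp     = 7.6381686359952968e-07  # CudaCpp (cppnone) cross section from the log above
rel_diff = abs(xsec_cpp - xsec_fortran) / xsec_fortran
assert rel_diff < 4e-4                 # tolerance printed in the log
print(f"{rel_diff:.6e}")               # ~9.95e-07, matching the value in parentheses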
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:01:20 +DATE: 2025-12-07_20:47:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3648s - [COUNTERS] Fortran MEs ( 1 ) : 2.3619s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3462s + [COUNTERS] Fortran MEs ( 1 ) : 2.2633s for 8192 events => throughput is 3.62E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId 
= 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s - [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] Fortran MEs ( 1 ) : 2.2600s for 8192 events => throughput is 3.62E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608794346840E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3762s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4897s for 8192 events => throughput is 3.29E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s + [COUNTERS] PROGRAM TOTAL : 2.7350s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3535s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3764s for 8192 events => throughput is 3.45E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608794346840E-007) differ by less than 2E-4 (2.0533499234254293e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.387716e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.555339e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386658e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.545382e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608713473394E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6908s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3716s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3164s for 8192 events => throughput is 6.22E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3529s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3051s for 8192 events => throughput is 6.28E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608713473394E-007) differ by less than 2E-4 (2.159230705345294e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.591306e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.856471e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.584653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.816759e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5924s for 8192 events => throughput is 1.38E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s + [COUNTERS] PROGRAM TOTAL : 0.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3543s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5663s for 8192 events => throughput is 1.45E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608835735750E-007) differ by less than 2E-4 (1.9991629129911814e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.420848e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.485826e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429579e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.490431e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5284s for 8192 events => throughput is 1.55E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + [COUNTERS] PROGRAM TOTAL : 0.8569s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5018s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608835735750E-007) differ by less than 2E-4 (1.9991629129911814e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.602337e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691863e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.607376e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684618e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7085s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s + [COUNTERS] PROGRAM TOTAL : 1.0068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6518s for 8192 events => throughput is 1.26E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608835735750E-007) differ by less than 2E-4 (1.9991629129911814e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273858e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176159e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.267164e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608867927968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8465s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0188s + [COUNTERS] PROGRAM TOTAL : 0.8296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7985s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.50E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0184s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.668728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.754864e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.889186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.949659e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.020522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.978286e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.111985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.109348e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007368e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.139379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.123267e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.980651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977519e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325536e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 869ed226f5..2a1d8675bb 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
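[Editor's note, not part of the committed logs] The throughput printed in each [COUNTERS] line appears to be simply the event count divided by the elapsed time reported on the same line; a small sketch under that assumption (numbers taken from one of the "Fortran MEs" lines above, 8192 events matching CUDACPP_RUNTIME_VECSIZEUSED):

# Minimal sketch: relate the [COUNTERS] throughput to the time and event count on the same line.
nevents = 8192                          # events per iteration in these runs
seconds = 2.2533                        # "Fortran MEs" time from a log above
throughput = nevents / seconds
print(f"{throughput:.2e} events/s")     # ~3.64E+03, as printed in the log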
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:56 +DATE: 2025-12-07_20:45:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.7024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6938s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6617s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.93E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4110s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4025s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.70E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4280s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 8192 events => throughput is 9.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.191014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.189685e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.293021e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4316s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4150s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.81E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.860989e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.849944e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.909431e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913941e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4296s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4117s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006727e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856827e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.109595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333019e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4281s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.16E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.041656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.205523e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.245400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497045e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4307s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.73E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847128e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.908995e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.978037e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094895e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426103] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8616s + [COUNTERS] PROGRAM TOTAL : 0.8467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8428s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0032s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.369013e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.521671e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.148244e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.110249e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.850459e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666166e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.711716e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666981e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810975e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.645406e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845473e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.832520e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.786901e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.677954e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.505596e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504851e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 290a3c86d1..c6e31524a0 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:24 +DATE: 2025-12-07_20:46:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6911s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6710s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6625s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.61E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4259s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4174s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4043s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4354s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 8192 events => throughput is 9.90E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.988834e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.949961e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001217e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.961897e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186369e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237148e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.260058e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4151s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.37E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.015677e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.026482e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.228371e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4273s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.39E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4136s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4116s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.67E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231045e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.232843e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.443837e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.468910e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.87E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.280248e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.435101e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.772169e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.760333e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449447192383194] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s + [COUNTERS] PROGRAM TOTAL : 0.8470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.023525e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.212411e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.499953e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.160545e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.571654e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.558916e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.512624e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.440681e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.627183e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.320302e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.275419e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.015605e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.136216e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317583e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 54eb3e1a6f..80b5453e4e 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: 
warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:10 +DATE: 2025-12-07_20:45:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6912s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6825s - [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6695s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6611s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.77E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4048s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.97E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453136999483] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.80E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) +OK! 
xsec from fortran (0.30449452343426120) and cpp (0.30449453136999483) differ by less than 2E-4 (2.6061991231784987e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.020488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.045832e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.158136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.086288e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453156715223] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453156715223) differ by less than 2E-4 (2.6709482181530575e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.944164e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932268e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.993181e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453230280987] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4315s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4111s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453230280987) differ by less than 2E-4 (2.9125478473446265e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282930e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.245395e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.189855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.658082e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453230280987] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4314s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4151s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.11E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453230280987) differ by less than 2E-4 (2.9125478473446265e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.114512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431556e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.432567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.645571e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453230280987] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4300s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4264s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4170s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.54E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453230280987) differ by less than 2E-4 (2.9125478473446265e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.966860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.967306e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.100849e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.313640e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453231638185] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8619s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + [COUNTERS] PROGRAM TOTAL : 0.8523s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8482s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132456e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.083540e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.476431e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.139339e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.825751e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664841e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688447e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662022e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845505e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666501e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.878507e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.813794e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.760833e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662763e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.514420e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.513804e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 79dba98821..99ec0316f0 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - -make USEBUILDDIR=1 BACKEND=cuda +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' 
-make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: 
warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 
'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:12 +DATE: 2025-12-07_20:44:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8203s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8347s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7923s + [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4014s + [COUNTERS] Fortran MEs ( 1 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.837387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864532e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.822913e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.892815e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4480s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4449s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4202s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.267707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.197686e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.222778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292603e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4349s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4298s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4139s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.26E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.198106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.143861e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.028037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.202767e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.57E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.463972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.497654e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.474487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.459343e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4305s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.72E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.505694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552612e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.682563e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8617s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.8507s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8460s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.923790e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.942171e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.174225e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309606e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777101e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665767e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.655868e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.654682e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.765814e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.668761e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993174e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.978602e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = 
SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.751468e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.645522e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.413877e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413274e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 5dfa48ff39..f8e65d3aa1 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,7 +1,23 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda @@ -9,28 +25,118 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for 
target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:42 +DATE: 2025-12-07_20:45:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8523s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8088s - [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8284s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7861s + [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] 
MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4005s + [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4653s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4221s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4552s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.918004e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981049e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.936998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.978435e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4236s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.86E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.699516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.725379e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.722220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.863154e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.72E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4067s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.81E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.856695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.133406e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.157334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.099375e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4281s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0087s for 8192 events => throughput is 9.45E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.452792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.430242e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.496015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.531645e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4075s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.06E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.751797e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785117e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.913260e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911000118164] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s + [COUNTERS] PROGRAM TOTAL : 0.8501s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8455s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0038s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.158414e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408767e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.781779e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.984120e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.387147e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.335411e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.660863e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.663118e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.340902e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.355492e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.882663e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904182e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.999883e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.024954e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.181537e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.199708e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 4c27cac81e..36bdee1847 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:27 +DATE: 2025-12-07_20:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8565s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8130s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8285s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7866s + [COUNTERS] Fortran MEs ( 1 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] 
MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4587s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3987s + [COUNTERS] Fortran MEs ( 1 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912952585443] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4546s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4092s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912952585443) differ by less than 2E-4 (2.815153865576292e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.793421e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835713e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.799600e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850999e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912934246548] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4329s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912934246548) differ by less than 2E-4 (2.7740738595127823e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253211e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.281864e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367883e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912966143884] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4382s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4219s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4258s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.35E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912966143884) differ by less than 2E-4 (2.8455254152959242e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.329657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.316201e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.307405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.360584e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912966143884] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4242s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.68E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912966143884) differ by less than 2E-4 (2.8455254152959242e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.584798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.636337e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.705746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686318e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912966143884] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4205s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4066s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.76E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912966143884) differ by less than 2E-4 (2.8455254152959242e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.605692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.657824e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.652839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654498e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912949951454] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8669s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.8656s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8608s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.727760e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.806470e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.049471e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.450723e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.736425e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.639708e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634947e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.654956e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.745425e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.621877e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.997146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.978718e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.652110e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415073e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410874e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling index 1608b91cb1..75cd5e63ac 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:39:36 +DATE: 2025-12-07_17:56:28 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.365880e+06 1 256 -4.932658e+06 2 256 -1.130330e+07 4 256 -2.221065e+07 8 256 -3.796917e+07 16 256 -8.093742e+07 32 256 -1.438543e+08 64 256 -2.092652e+08 128 256 -2.586706e+08 256 256 -3.166572e+08 512 256 -3.450925e+08 1024 256 +2.828292e+06 1 256 +6.114090e+06 2 256 +1.163359e+07 4 256 +2.245762e+07 8 256 +3.962197e+07 16 256 +8.583493e+07 32 256 +1.486765e+08 64 256 +1.985410e+08 128 256 +2.543566e+08 256 256 +3.191398e+08 512 256 +3.589952e+08 1024 256 ### GPU: scaling test 32 -3.615411e+05 1 32 -7.956340e+05 2 32 -1.534533e+06 4 32 -2.896550e+06 8 32 -5.416499e+06 16 32 -1.086184e+07 32 32 -2.239377e+07 64 32 -4.040723e+07 128 32 -8.109125e+07 256 32 -1.501315e+08 512 32 -2.161406e+08 1024 32 -2.736516e+08 2048 32 -3.294400e+08 4096 32 -3.666924e+08 8192 32 +3.460732e+05 1 32 +8.262329e+05 2 32 +1.484385e+06 4 32 +2.893636e+06 8 32 +6.066567e+06 16 32 +1.002968e+07 32 32 +2.349297e+07 64 32 +3.536186e+07 128 32 +8.757002e+07 256 32 +1.583867e+08 512 32 +2.115170e+08 1024 32 +2.787414e+08 2048 32 +3.346815e+08 4096 32 +3.729437e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.112163e+06 1 256 -1.095778e+06 2 256 -1.085622e+06 4 256 +1.031165e+06 1 256 +1.102961e+06 2 256 +1.103733e+06 4 256 ### CPU: scaling test 32 -9.838283e+05 1 32 -1.009336e+06 2 32 -1.104848e+06 4 32 +1.057187e+06 1 32 +1.090959e+06 2 32 +1.105583e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.791676e+06 1 256 -1.843126e+06 2 256 -1.850216e+06 4 256 +1.891169e+06 1 256 +1.848496e+06 2 256 +1.743854e+06 4 256 ### CPU: scaling test 32 -1.835283e+06 1 32 -1.487162e+06 2 32 -1.478777e+06 4 32 +1.596488e+06 1 32 +1.520335e+06 2 32 +1.732469e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.691677e+06 1 256 -2.725347e+06 2 256 -2.679688e+06 4 256 +2.232591e+06 1 256 +2.714884e+06 2 256 +2.690560e+06 4 256 ### CPU: scaling test 32 -2.224230e+06 1 32 -2.558465e+06 2 32 -2.649774e+06 4 32 +2.373535e+06 1 32 +2.562870e+06 2 32 +2.650872e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.781551e+06 1 256 -2.448941e+06 2 256 -2.756282e+06 4 256 +2.776181e+06 1 256 +2.564552e+06 2 256 +2.732605e+06 4 256 ### CPU: scaling test 32 -2.377238e+06 1 32 -2.626719e+06 2 32 -2.722014e+06 4 32 +2.251618e+06 1 32 +1.672285e+06 2 32 +2.326723e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.040101e+06 1 256 -2.059277e+06 2 256 -2.194331e+06 4 256 +2.043455e+06 1 256 +2.033861e+06 2 256 +2.149784e+06 4 256 ### CPU: scaling test 32 -1.410251e+06 1 32 -1.626347e+06 2 32 -1.877466e+06 4 32 +2.679618e+06 1 32 +1.606789e+06 2 32 +1.849230e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 6b63860e97..7ea11da7ec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:13:43 +DATE: 2025-12-07_17:31:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.456825e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.020579e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872827e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.448256e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.095942e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.924818e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693291 sec - 2,729,119,040 cycles # 2.827 GHz - 4,039,185,150 instructions # 1.48 insn per cycle - 1.043410313 seconds time elapsed +TOTAL : 0.779644 sec + 2,832,606,212 cycles # 2.888 GHz + 4,254,803,118 instructions # 1.50 insn per cycle + 1.394928839 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.019940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187870e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187870e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.047604e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219439e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.588033 sec - 19,038,044,386 cycles # 2.888 GHz - 46,485,585,356 instructions # 2.44 insn per cycle - 6.596061286 seconds time elapsed +TOTAL : 6.411794 sec + 19,014,240,782 cycles # 2.964 GHz + 46,485,315,191 instructions # 2.44 insn per cycle + 6.416861168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.557129e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030035e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.030035e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.598686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.089132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.089132e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.460811 sec - 12,939,620,485 cycles # 2.898 GHz - 31,810,901,247 instructions # 2.46 insn per cycle - 4.469139042 seconds time elapsed +TOTAL : 4.346013 sec + 12,961,637,078 cycles # 2.979 GHz + 31,812,423,686 instructions # 2.45 insn per cycle + 4.352494980 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.933537e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.681631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.681631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995383e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.769392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.769392e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.671840 sec - 10,104,892,452 cycles # 2.749 GHz - 19,727,697,375 instructions # 1.95 insn per cycle - 3.679095535 seconds time elapsed +TOTAL : 3.556154 sec + 10,091,928,187 cycles # 2.835 GHz + 
19,729,979,199 instructions # 1.96 insn per cycle + 3.561316676 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.989488e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781185e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.071101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.895575e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.895575e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.576826 sec - 9,900,381,139 cycles # 2.765 GHz - 19,380,047,753 instructions # 1.96 insn per cycle - 3.585735108 seconds time elapsed +TOTAL : 3.437536 sec + 9,847,578,789 cycles # 2.862 GHz + 19,380,355,138 instructions # 1.97 insn per cycle + 3.442447176 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.671348e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.773261e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352997e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352997e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.184170 sec - 8,626,596,296 cycles # 2.060 GHz - 15,802,085,882 instructions # 1.83 insn per cycle - 4.189889070 seconds time elapsed +TOTAL : 3.952172 sec + 8,636,738,592 cycles # 2.183 GHz + 15,800,904,624 instructions # 1.83 insn per cycle + 3.957156027 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 7af659d91e..80b060593d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:27:21 +DATE: 2025-12-07_18:47:15 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.684743e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.912007e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.912007e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.573245e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.608064e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.608064e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.246839 sec - 7,225,562,469 cycles # 2.863 GHz - 12,863,341,750 instructions # 1.78 insn per cycle - 2.580507454 seconds time elapsed +TOTAL : 2.279780 sec + 7,407,168,544 cycles # 2.899 GHz + 12,994,703,778 instructions # 1.75 insn per cycle + 2.611237573 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.838576e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140129e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140129e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.010225e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.170408e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.170408e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 7.023062 sec - 20,241,810,963 cycles # 2.880 GHz - 46,692,050,581 instructions # 2.31 insn per cycle - 7.030271965 seconds time elapsed +TOTAL : 6.836610 sec + 20,265,690,531 cycles # 2.962 GHz + 46,694,932,139 instructions # 2.30 insn per cycle + 6.843290951 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.470152e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890657e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890657e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.514486e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.952586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.952586e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.909808 sec - 14,179,876,666 cycles # 2.885 GHz - 32,595,242,292 instructions # 2.30 insn per cycle - 4.916954834 seconds time elapsed +TOTAL : 4.772908 sec + 14,246,240,749 cycles # 2.981 GHz + 32,594,540,614 instructions # 2.29 insn per cycle + 4.779610612 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.819567e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481129e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.481129e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870875e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.538776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.538776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.095092 sec - 11,322,720,907 cycles # 2.761 GHz - 21,029,920,385 instructions # 1.86 insn per cycle - 4.102381100 seconds time elapsed +TOTAL : 3.987125 sec + 11,388,845,294 cycles # 2.852 GHz + 21,028,692,593 instructions # 1.85 insn per cycle + 3.993809857 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.870930e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557290e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.557290e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.638725e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.638725e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.995093 sec - 11,100,469,150 cycles # 2.774 GHz - 20,681,913,151 instructions # 1.86 insn per cycle - 4.002396442 seconds time elapsed +TOTAL : 3.876798 sec + 11,134,310,600 cycles # 2.868 GHz + 20,669,878,356 instructions # 1.86 insn per cycle + 3.883603619 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.582678e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044225e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594991e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060796e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060796e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.613845 sec - 9,931,301,323 cycles # 2.150 GHz - 16,893,944,858 instructions # 1.70 insn per cycle - 4.620613606 seconds time elapsed +TOTAL : 4.584098 sec + 9,957,038,756 cycles # 2.180 GHz + 16,894,368,379 instructions # 1.70 insn per cycle + 4.590842064 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 26a3ddb0c7..d3c5f30319 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:42:49 +DATE: 2025-12-07_19:02:10 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.197440e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.038954e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.882278e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.236250e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.998602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.861106e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.377431 sec - 4,700,779,648 cycles # 2.862 GHz - 7,103,932,908 instructions # 1.51 insn per cycle - 1.699431401 seconds time elapsed +TOTAL : 1.352434 sec + 4,735,937,915 cycles # 2.934 GHz + 7,191,032,078 instructions # 1.52 insn per cycle + 1.670156228 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.015955e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183181e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183181e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.050764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224069e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.982657 sec - 20,123,225,872 cycles # 2.880 GHz - 46,589,016,073 instructions # 2.32 insn per cycle - 6.988225439 seconds time elapsed +TOTAL : 6.756956 sec + 20,147,730,410 cycles # 2.980 GHz + 46,588,298,090 instructions # 2.31 insn per cycle + 6.762343959 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538846e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.003610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.003610e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.591799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.078459e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.078459e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.882603 sec - 14,026,556,551 cycles # 2.870 GHz - 31,813,873,682 instructions # 2.27 insn per cycle - 4.888198902 seconds time elapsed +TOTAL : 4.726427 sec + 14,053,068,885 cycles # 2.971 GHz + 31,813,655,279 instructions # 2.26 insn per cycle + 4.731607799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.898151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.633048e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.633048e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.999935e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.793645e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.793645e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.110798 sec - 11,260,535,150 cycles # 2.739 GHz - 19,633,224,823 instructions # 1.74 insn per cycle - 4.116583823 seconds time elapsed +TOTAL : 3.912548 sec + 11,235,849,310 cycles # 2.869 GHz + 
19,631,317,818 instructions # 1.75 insn per cycle + 3.917823386 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.970956e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.746513e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.746513e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.015021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.818185e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.818185e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.988212 sec - 10,998,193,863 cycles # 2.755 GHz - 19,082,144,667 instructions # 1.74 insn per cycle - 3.993745104 seconds time elapsed +TOTAL : 3.904634 sec + 11,044,903,653 cycles # 2.829 GHz + 19,073,131,881 instructions # 1.73 insn per cycle + 3.909957111 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.672146e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193639e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193639e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.750528e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315643e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315643e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.562173 sec - 9,723,899,863 cycles # 2.130 GHz - 15,503,539,741 instructions # 1.59 insn per cycle - 4.567607097 seconds time elapsed +TOTAL : 4.368511 sec + 9,754,660,947 cycles # 2.231 GHz + 15,502,098,574 instructions # 1.59 insn per cycle + 4.373796354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 6fb7bec229..f4ac25e43e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:39:22 +DATE: 2025-12-07_18:58:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.211048e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.057687e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.886821e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.290108e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.021504e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.888415e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.007194 sec - 3,630,386,848 cycles # 2.852 GHz - 7,085,182,200 instructions # 1.95 insn per cycle - 1.329367848 seconds time elapsed +TOTAL : 0.989078 sec + 3,662,420,928 cycles # 2.936 GHz + 7,078,360,798 instructions # 1.93 insn per cycle + 1.306216423 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.609025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.050387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224773e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.966326 sec - 20,072,455,939 cycles # 2.880 GHz - 46,487,974,788 instructions # 2.32 insn per cycle - 6.971901471 seconds time elapsed +TOTAL : 6.396138 sec + 19,070,234,656 cycles # 2.980 GHz + 46,484,149,549 instructions # 2.44 insn per cycle + 6.401528097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.534636e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.011512e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.011512e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596634e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.083605e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.083605e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.522016 sec - 13,022,549,779 cycles # 2.877 GHz - 31,812,825,471 instructions # 2.44 insn per cycle - 4.527552219 seconds time elapsed +TOTAL : 4.347884 sec + 12,991,527,975 cycles # 2.985 GHz + 31,810,788,836 instructions # 2.45 insn per cycle + 4.353235387 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.935285e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.687999e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.687999e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.011038e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.809141e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.809141e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.667443 sec - 10,100,998,652 cycles # 2.751 GHz - 19,728,236,183 instructions # 1.95 insn per cycle - 3.673057057 seconds time elapsed +TOTAL : 3.532089 sec + 10,106,621,871 cycles # 2.858 GHz + 
19,727,617,278 instructions # 1.95 insn per cycle + 3.537439455 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.992051e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.787343e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.787343e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.058582e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.887690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.887690e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.571290 sec - 9,885,962,165 cycles # 2.765 GHz - 19,369,829,317 instructions # 1.96 insn per cycle - 3.576876880 seconds time elapsed +TOTAL : 3.457150 sec + 9,906,100,163 cycles # 2.862 GHz + 19,379,885,778 instructions # 1.96 insn per cycle + 3.462456611 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.693244e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.231997e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.231997e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.742524e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.303662e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.303662e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.132357 sec - 8,622,523,625 cycles # 2.084 GHz - 15,800,710,236 instructions # 1.83 insn per cycle - 4.137999929 seconds time elapsed +TOTAL : 4.018381 sec + 8,637,190,113 cycles # 2.147 GHz + 15,800,690,263 instructions # 1.83 insn per cycle + 4.023793810 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 93b11c3b79..1a4179db3f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:35:54 +DATE: 2025-12-07_18:55:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.941086e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084749e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.895980e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.083802e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986419e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.835576e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.918291 sec - 6,252,733,621 cycles # 2.863 GHz - 11,379,391,021 instructions # 1.82 insn per cycle - 2.240220236 seconds time elapsed +TOTAL : 1.877108 sec + 6,301,961,688 cycles # 2.947 GHz + 11,453,259,515 instructions # 1.82 insn per cycle + 2.194535556 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180354e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.053345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.227087e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.227087e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.629592 sec - 19,062,117,259 cycles # 2.874 GHz - 46,484,682,805 instructions # 2.44 insn per cycle - 6.635147352 seconds time elapsed +TOTAL : 6.377614 sec + 19,046,187,381 cycles # 2.985 GHz + 46,484,805,427 instructions # 2.44 insn per cycle + 6.383126199 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.545386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.014583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.014583e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.590196e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.074245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.074245e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.493129 sec - 12,958,309,518 cycles # 2.881 GHz - 31,813,104,162 instructions # 2.46 insn per cycle - 4.498775995 seconds time elapsed +TOTAL : 4.368053 sec + 12,966,028,980 cycles # 2.965 GHz + 31,811,250,177 instructions # 2.45 insn per cycle + 4.373267395 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.912965e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.656557e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.656557e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.693965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.693965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.707178 sec - 10,138,189,210 cycles # 2.732 GHz - 19,728,296,128 instructions # 1.95 insn per cycle - 3.712878607 seconds time elapsed +TOTAL : 3.647431 sec + 10,132,161,863 cycles # 2.775 GHz + 19,731,344,136 instructions # 1.95 insn per cycle + 3.652831940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.985253e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.770354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.770354e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040167e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.853056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.853056e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.582064 sec - 9,886,774,092 cycles # 2.757 GHz - 19,370,169,431 instructions # 1.96 insn per cycle - 3.587619730 seconds time elapsed +TOTAL : 3.486697 sec + 9,879,867,582 cycles # 2.831 GHz + 19,370,142,954 instructions # 1.96 insn per cycle + 3.491906168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.686193e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230105e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230105e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.743662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.307246e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.307246e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.149789 sec - 8,677,655,368 cycles # 2.089 GHz - 15,800,773,198 instructions # 1.82 insn per cycle - 4.155474285 seconds time elapsed +TOTAL : 4.013127 sec + 8,683,983,866 cycles # 2.162 GHz + 15,804,148,886 instructions # 1.82 insn per cycle + 4.018426427 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 0a4631bfc6..8735fba0bf 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:14:20 +DATE: 2025-12-07_17:32:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.305792e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022345e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.904091e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.720645e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.115346e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.954732e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693566 sec - 2,710,557,615 cycles # 2.827 GHz - 4,083,363,883 instructions # 1.51 insn per cycle - 1.021549892 seconds time elapsed +TOTAL : 0.671172 sec + 2,729,060,395 cycles # 2.926 GHz + 4,102,381,427 instructions # 1.50 insn per cycle + 0.993507551 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.017450e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184170e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184170e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.048082e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222737e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.603628 sec - 19,045,137,786 cycles # 2.882 GHz - 46,458,572,507 instructions # 2.44 insn per cycle - 6.609045751 seconds time elapsed +TOTAL : 6.406847 sec + 19,068,257,968 cycles # 2.974 GHz + 46,458,750,757 instructions # 2.44 insn per cycle + 6.411892946 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.561588e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042161e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.042161e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594610e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.080938e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.080938e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.447754 sec - 12,946,444,589 cycles # 2.908 GHz - 31,786,052,376 instructions # 2.46 insn per cycle - 4.453579330 seconds time elapsed +TOTAL : 4.355635 sec + 12,906,412,880 cycles # 2.960 GHz + 31,785,278,738 instructions # 2.46 insn per cycle + 4.360890055 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.943406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.706594e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.706594e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995984e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.773644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773644e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652290 sec - 10,144,241,352 cycles # 2.774 GHz - 19,717,545,087 instructions # 1.94 insn per cycle - 3.657857806 seconds time elapsed +TOTAL : 3.557125 sec + 10,095,981,818 cycles # 2.835 GHz + 
19,718,699,261 instructions # 1.95 insn per cycle + 3.562205026 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.997101e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.794298e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.794298e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.059878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877856e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877856e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.563735 sec - 9,854,038,944 cycles # 2.762 GHz - 19,385,201,008 instructions # 1.97 insn per cycle - 3.569441170 seconds time elapsed +TOTAL : 3.455708 sec + 9,846,328,132 cycles # 2.846 GHz + 19,375,603,015 instructions # 1.97 insn per cycle + 3.460815714 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1640) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.736214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.301251e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.301251e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824405e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.430959e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430959e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.039858 sec - 8,445,670,568 cycles # 2.088 GHz - 15,663,059,460 instructions # 1.85 insn per cycle - 4.045505615 seconds time elapsed +TOTAL : 3.852810 sec + 8,441,958,170 cycles # 2.189 GHz + 15,662,548,315 instructions # 1.86 insn per cycle + 3.857925799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 845) (512y: 154) (512z: 1244) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9b568d27dc..e364fa605e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:16:29 +DATE: 2025-12-07_18:36:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.176996e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.012495e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.891048e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.890300e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.928162e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.816710e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.694489 sec - 2,721,882,133 cycles # 2.827 GHz - 4,075,193,578 instructions # 1.50 insn per cycle - 1.025946647 seconds time elapsed +TOTAL : 0.692679 sec + 2,787,364,694 cycles # 2.904 GHz + 4,176,559,202 instructions # 1.50 insn per cycle + 1.019493467 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.542747e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.967302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.967302e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572900e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003447e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.494551 sec - 12,989,678,815 cycles # 2.889 GHz - 32,646,175,174 instructions # 2.51 insn per cycle - 4.499744847 seconds time elapsed +TOTAL : 4.411365 sec + 13,017,055,620 cycles # 2.948 GHz + 32,648,682,426 instructions # 2.51 insn per cycle + 4.416890047 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.896999e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.655930e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.655930e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951491e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.740768e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.740768e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.740364 sec - 10,735,813,544 cycles # 2.867 GHz - 24,899,817,001 instructions # 2.32 insn per cycle - 3.745821170 seconds time elapsed +TOTAL : 3.634756 sec + 10,786,620,150 cycles # 2.964 GHz + 24,900,148,433 instructions # 2.31 insn per cycle + 3.640248648 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.183902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.196051e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.196051e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.253770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.306871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.306871e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.294762 sec - 9,147,621,247 cycles # 2.773 GHz - 16,945,065,636 instructions # 1.85 insn per cycle - 3.300349072 seconds time elapsed +TOTAL : 3.196107 sec + 9,161,638,559 cycles # 2.862 GHz + 16,946,627,953 
instructions # 1.85 insn per cycle + 3.201549279 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1609) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.267329e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347814e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.347814e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.318915e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443607e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.186397 sec - 8,854,475,202 cycles # 2.775 GHz - 16,456,181,779 instructions # 1.86 insn per cycle - 3.191297678 seconds time elapsed +TOTAL : 3.113431 sec + 8,923,659,642 cycles # 2.862 GHz + 16,457,885,539 instructions # 1.84 insn per cycle + 3.118765386 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1359) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.906352e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.613901e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.613901e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.925447e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.643033e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643033e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.717092 sec - 7,920,630,909 cycles # 2.128 GHz - 14,619,990,772 instructions # 1.85 insn per cycle - 3.722531495 seconds time elapsed +TOTAL : 3.681436 sec + 7,890,978,674 cycles # 2.141 GHz + 14,621,037,583 instructions # 1.85 insn per cycle + 3.686967166 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1004) (512y: 158) (512z: 960) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index e2fad0413c..3fc85a00ac 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:16:58 +DATE: 2025-12-07_18:37:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.326337e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070850e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.905795e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.879940e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.956428e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.863525e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687566 sec - 2,696,565,159 cycles # 2.829 GHz - 4,062,904,580 instructions # 1.51 insn per cycle - 1.010928380 seconds time elapsed +TOTAL : 0.688494 sec + 2,795,598,941 cycles # 2.930 GHz + 4,160,180,222 instructions # 1.49 insn per cycle + 1.014588467 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.043775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.849543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.849543e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.096983e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.942352e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942352e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.494605 sec - 10,083,396,787 cycles # 2.882 GHz - 25,760,449,217 instructions # 2.55 insn per cycle - 3.499888853 seconds time elapsed +TOTAL : 3.405192 sec + 10,149,407,499 cycles # 2.977 GHz + 25,760,489,120 instructions # 2.54 insn per cycle + 3.410552016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 246) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.297652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.517332e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.517332e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.371044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.633422e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.633422e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.161432 sec - 9,089,198,091 cycles # 2.871 GHz - 21,827,149,693 instructions # 2.40 insn per cycle - 3.166784889 seconds time elapsed +TOTAL : 3.062669 sec + 9,128,357,755 cycles # 2.976 GHz + 21,828,519,602 instructions # 2.39 insn per cycle + 3.068232537 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.295786e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.454015e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.454015e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397839e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.621400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.621400e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.158774 sec - 8,695,257,664 cycles # 2.749 GHz - 15,965,615,823 instructions # 1.84 insn per cycle - 3.164128836 seconds time elapsed +TOTAL : 3.026156 sec + 8,697,048,171 cycles # 2.870 GHz + 15,965,483,115 
instructions # 1.84 insn per cycle + 3.031597373 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1484) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.398085e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.643924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.643924e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.442847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.704788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.704788e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.034628 sec - 8,440,163,243 cycles # 2.777 GHz - 15,795,186,827 instructions # 1.87 insn per cycle - 3.039990401 seconds time elapsed +TOTAL : 2.978616 sec + 8,501,941,963 cycles # 2.850 GHz + 15,807,109,670 instructions # 1.86 insn per cycle + 2.984167522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1288) (512y: 141) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.002688e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.799181e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.799181e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.106573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980339e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.980339e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.557099 sec - 7,607,771,698 cycles # 2.137 GHz - 14,233,174,966 instructions # 1.87 insn per cycle - 3.562310738 seconds time elapsed +TOTAL : 3.391515 sec + 7,633,582,988 cycles # 2.248 GHz + 14,233,877,564 instructions # 1.86 insn per cycle + 3.397176354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 992) (512y: 158) (512z: 880) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling index a78c1b2deb..7f42c851c7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:40:18 +DATE: 2025-12-07_17:57:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.981251e+06 1 256 -6.047935e+06 2 256 -1.122832e+07 4 256 -2.252678e+07 8 256 -4.235605e+07 16 256 -8.416122e+07 32 256 -1.466169e+08 64 256 -3.049065e+08 128 256 -4.651176e+08 256 256 -6.085927e+08 512 256 -7.481343e+08 1024 256 +3.157376e+06 1 256 +6.103158e+06 2 256 +1.006082e+07 4 256 +2.405563e+07 8 256 +4.257354e+07 16 256 +8.951636e+07 32 256 +1.710426e+08 64 256 +3.157205e+08 128 256 +4.734405e+08 256 256 +6.391262e+08 512 256 +7.333397e+08 1024 256 ### GPU: scaling test 32 -4.108938e+05 1 32 -7.731896e+05 2 32 -1.472652e+06 4 32 -3.058688e+06 8 32 -4.923029e+06 16 32 -1.154805e+07 32 32 -2.237762e+07 64 32 -4.518229e+07 128 32 -7.698959e+07 256 32 -1.503754e+08 512 32 -2.942634e+08 1024 32 -4.027161e+08 2048 32 -5.199929e+08 4096 32 -5.853205e+08 8192 32 +4.338218e+05 1 32 +8.495500e+05 2 32 +1.561928e+06 4 32 +2.747548e+06 8 32 +6.510102e+06 16 32 +1.022701e+07 32 32 +2.490515e+07 64 32 +4.256867e+07 128 32 +8.780938e+07 256 32 +1.626284e+08 512 32 +2.845704e+08 1024 32 +3.804614e+08 2048 32 +5.075589e+08 4096 32 +5.841454e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.083777e+06 1 256 -1.126195e+06 2 256 -1.126272e+06 4 256 +1.136349e+06 1 256 +1.121914e+06 2 256 +1.139956e+06 4 256 ### CPU: scaling test 32 -1.086034e+06 1 32 -1.116071e+06 2 32 -1.128798e+06 4 32 +8.086731e+05 1 32 +1.113256e+06 2 32 +1.045495e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.853894e+06 1 256 -3.152865e+06 2 256 -3.025871e+06 4 256 +2.840657e+06 1 256 +2.900324e+06 2 256 +2.840121e+06 4 256 ### CPU: scaling test 32 -2.851034e+06 1 32 -2.925313e+06 2 32 -2.581790e+06 4 32 +2.906713e+06 1 32 +3.135718e+06 2 32 +2.561793e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.276087e+06 1 256 -3.611916e+06 2 256 -3.183634e+06 4 256 +3.568690e+06 1 256 +3.619142e+06 2 256 +3.570183e+06 4 256 ### CPU: scaling test 32 -3.073082e+06 1 32 -3.375349e+06 2 32 -2.927052e+06 4 32 +1.577754e+06 1 32 +3.191702e+06 2 32 +3.498319e+06 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.662480e+06 1 256 -3.408266e+06 2 256 -3.661694e+06 4 256 +3.662427e+06 1 256 +3.455840e+06 2 256 +3.567074e+06 4 256 ### CPU: scaling test 32 -1.789109e+06 1 32 -3.449949e+06 2 32 -3.560402e+06 4 32 +3.188522e+06 1 32 +3.427959e+06 2 32 +3.322345e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.254224e+06 1 256 -3.401880e+06 2 256 -3.536803e+06 4 256 +3.187251e+06 1 256 +3.454161e+06 2 256 +3.455642e+06 4 256 ### CPU: scaling test 32 -1.684033e+06 1 32 -2.687382e+06 2 32 -2.916448e+06 4 32 +3.991518e+06 1 32 +2.538575e+06 2 32 +2.974254e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 9dacd0443a..01651ec87e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:16:08 +DATE: 2025-12-07_17:33:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.223637e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675161e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.645637e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.347836e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.805183e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.918065e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.588199 sec - 2,408,587,167 cycles # 2.842 GHz - 3,683,823,828 instructions # 1.53 insn per cycle - 0.903961148 seconds time elapsed +TOTAL : 0.575049 sec + 2,409,585,784 cycles # 2.916 GHz + 3,671,404,896 instructions # 1.52 insn per cycle + 0.885611906 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.035251e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217456e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.217456e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.069468e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.257538e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.257538e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.454566 sec - 18,664,660,450 cycles # 2.890 GHz - 45,251,843,843 instructions # 2.42 insn per cycle - 6.459911913 seconds time elapsed +TOTAL : 6.246770 sec + 18,656,170,333 cycles # 2.985 GHz + 45,252,855,956 instructions # 2.43 insn per cycle + 6.251582459 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.213678e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.366853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366853e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.295853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489405e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.221547 sec - 9,347,928,391 cycles # 2.898 GHz - 22,375,063,737 instructions # 2.39 insn per cycle - 3.226933374 seconds time elapsed +TOTAL : 3.108360 sec + 
9,295,230,853 cycles # 2.986 GHz + 22,375,421,447 instructions # 2.41 insn per cycle + 3.113256520 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.361341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.581474e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.581474e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.425607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.671187e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.671187e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.041655 sec - 8,385,705,935 cycles # 2.753 GHz - 15,815,253,481 instructions # 1.89 insn per cycle - 3.046966557 seconds time elapsed +TOTAL : 2.959349 sec + 8,389,814,530 cycles # 2.831 GHz + 15,815,454,579 instructions # 1.89 insn per cycle + 2.964069012 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.426573e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.714317e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.714317e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.476389e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.793725e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.793725e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.970277 sec - 8,276,306,484 cycles # 2.782 GHz - 15,653,687,115 instructions # 1.89 insn per cycle - 2.975610452 seconds time elapsed +TOTAL : 2.907669 sec + 8,278,733,530 cycles # 2.844 GHz + 15,648,696,957 instructions # 1.89 insn per cycle + 2.912381859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.392250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619370e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619370e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.509388e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.836050e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.836050e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.010134 
sec - 6,663,148,382 cycles # 2.210 GHz - 12,894,118,429 instructions # 1.94 insn per cycle - 3.015621591 seconds time elapsed +TOTAL : 2.875338 sec + 6,628,866,150 cycles # 2.302 GHz + 12,894,171,805 instructions # 1.95 insn per cycle + 2.880295553 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 215370ad38..fd0e41cb29 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:28:03 +DATE: 2025-12-07_18:47:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.220206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.249013e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.249013e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.177718e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.780840e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.780840e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.704287 sec - 5,590,644,626 cycles # 2.843 GHz - 10,005,372,723 instructions # 1.79 insn per cycle - 2.022727811 seconds time elapsed +TOTAL : 1.693072 sec + 5,726,531,729 cycles # 2.930 GHz + 10,190,014,908 instructions # 1.78 insn per cycle + 2.011152985 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.010617e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.186955e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.186955e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.007387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.713335 sec - 19,329,941,883 cycles # 2.877 GHz - 45,365,505,516 instructions # 2.35 insn per cycle - 6.720261817 seconds time elapsed +TOTAL : 6.732165 sec + 19,286,403,747 cycles # 2.863 GHz + 45,366,749,456 instructions # 2.35 insn per cycle + 6.738552576 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.128665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.170237e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.170237e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.097741e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.125158e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.125158e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.459266 sec - 10,015,354,665 cycles # 2.890 GHz - 23,673,664,836 instructions # 2.36 insn per cycle - 3.466212345 seconds time elapsed +TOTAL : 3.510876 sec + 10,052,796,603 cycles # 2.859 GHz + 23,674,430,124 instructions # 2.36 insn per cycle + 3.517462594 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.263697e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.371457e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.371457e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.244261e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.286775 sec - 9,106,177,679 cycles # 2.766 GHz - 16,899,675,653 instructions # 1.86 insn per cycle - 3.293662887 seconds time elapsed +TOTAL : 3.314495 sec + 9,100,046,960 cycles # 2.742 GHz + 16,900,314,239 instructions # 1.86 insn per cycle + 3.321145894 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302738e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462511e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.462511e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.293820e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417956e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417956e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.240690 sec - 8,985,254,061 cycles # 2.768 GHz - 16,737,997,718 instructions # 1.86 insn per cycle - 3.247472027 seconds time elapsed +TOTAL : 3.250499 sec + 8,993,226,420 cycles # 2.762 GHz + 16,740,357,657 instructions # 1.86 insn per cycle + 3.256748984 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254993e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321155e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.449019e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.449019e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.302457 sec - 7,458,897,279 cycles # 2.255 GHz - 14,069,459,173 instructions # 1.89 insn per cycle - 3.309041869 seconds time elapsed +TOTAL : 3.212065 sec + 7,440,034,503 cycles # 2.313 GHz + 
14,066,058,158 instructions # 1.89 insn per cycle + 3.218550054 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index c35f97f2b8..588a59c70a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:43:25 +DATE: 2025-12-07_19:02:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.253381e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370790e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.518342e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.251709e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.531351e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.711358e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.218481 sec - 4,207,892,724 cycles # 2.859 GHz - 6,617,854,340 instructions # 1.57 insn per cycle - 1.530363886 seconds time elapsed +TOTAL : 1.183963 sec + 4,198,988,359 cycles # 2.922 GHz + 6,634,191,973 instructions # 1.58 insn per cycle + 1.493322587 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036512e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218588e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218588e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.066412e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.254599e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254599e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.791690 sec - 19,679,660,217 cycles # 2.896 GHz - 45,434,399,439 instructions # 2.31 insn per cycle - 6.797219573 seconds time elapsed +TOTAL : 6.599744 sec + 19,672,631,205 cycles # 2.979 GHz + 45,434,648,704 instructions # 2.31 insn per cycle + 6.604850374 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.200562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338496e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338496e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.282308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.482995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.482995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.583516 sec - 10,308,901,515 cycles # 2.874 GHz - 22,457,815,111 instructions # 2.18 insn per cycle - 3.588832664 seconds time elapsed +TOTAL : 3.454974 sec + 
10,323,888,918 cycles # 2.985 GHz + 22,457,053,676 instructions # 2.18 insn per cycle + 3.459824407 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344557e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.579879e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.579879e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.446086e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.737147e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.737147e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.404488 sec - 9,434,839,609 cycles # 2.768 GHz - 15,726,735,545 instructions # 1.67 insn per cycle - 3.409840593 seconds time elapsed +TOTAL : 3.266814 sec + 9,416,898,872 cycles # 2.879 GHz + 15,726,685,233 instructions # 1.67 insn per cycle + 3.271893169 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.407789e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.709415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.709415e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.491369e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837739e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837739e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.341843 sec - 9,335,373,029 cycles # 2.790 GHz - 15,365,478,048 instructions # 1.65 insn per cycle - 3.347112669 seconds time elapsed +TOTAL : 3.231379 sec + 9,319,723,592 cycles # 2.881 GHz + 15,364,079,397 instructions # 1.65 insn per cycle + 3.236344812 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.374032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.592267e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.592267e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.508717e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837359e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.383460 
sec - 7,651,857,041 cycles # 2.259 GHz - 12,604,317,732 instructions # 1.65 insn per cycle - 3.388617759 seconds time elapsed +TOTAL : 3.215440 sec + 7,660,823,057 cycles # 2.380 GHz + 12,605,053,694 instructions # 1.65 insn per cycle + 3.220376167 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 4fe47b6309..b5ddef1889 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:39:57 +DATE: 2025-12-07_18:59:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.232997e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388992e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.560013e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.244697e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.532699e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.686818e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.882532 sec - 3,214,322,203 cycles # 2.828 GHz - 6,452,752,496 instructions # 2.01 insn per cycle - 1.194579493 seconds time elapsed +TOTAL : 0.869375 sec + 3,256,437,462 cycles # 2.905 GHz + 6,549,133,015 instructions # 2.01 insn per cycle + 1.178429386 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.031419e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.212428e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.212428e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258607e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.477368 sec - 18,661,812,568 cycles # 2.879 GHz - 45,252,341,321 instructions # 2.42 insn per cycle - 6.482693144 seconds time elapsed +TOTAL : 6.241816 sec + 18,685,324,053 cycles # 2.992 GHz + 45,251,752,352 instructions # 2.42 insn per cycle + 6.246965777 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.196497e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.342466e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.342466e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.276952e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.460848e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.460848e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.247962 sec - 9,353,957,329 cycles # 2.876 GHz - 22,375,680,082 instructions # 2.39 insn per cycle - 3.253308897 seconds time elapsed +TOTAL : 3.133290 sec + 
9,346,560,119 cycles # 2.979 GHz + 22,377,421,466 instructions # 2.39 insn per cycle + 3.138163507 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.352259e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566980e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566980e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.422297e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.671733e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.671733e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.051523 sec - 8,419,136,103 cycles # 2.756 GHz - 15,815,678,204 instructions # 1.88 insn per cycle - 3.056921587 seconds time elapsed +TOTAL : 2.965685 sec + 8,408,515,886 cycles # 2.831 GHz + 15,815,239,505 instructions # 1.88 insn per cycle + 2.970696334 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.409169e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.699321e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.699321e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.503506e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.841324e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.841324e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.991131 sec - 8,296,340,422 cycles # 2.770 GHz - 15,649,217,834 instructions # 1.89 insn per cycle - 2.996375115 seconds time elapsed +TOTAL : 2.876896 sec + 8,271,897,673 cycles # 2.871 GHz + 15,649,269,823 instructions # 1.89 insn per cycle + 2.881998354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.362594e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.567971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.567971e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.501191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.832976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.832976e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.046737 
sec - 6,657,108,236 cycles # 2.182 GHz - 12,894,608,228 instructions # 1.94 insn per cycle - 3.052164277 seconds time elapsed +TOTAL : 2.885803 sec + 6,667,549,027 cycles # 2.307 GHz + 12,894,419,244 instructions # 1.93 insn per cycle + 2.890893342 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index a89730724c..fcd5a1d815 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:36:29 +DATE: 2025-12-07_18:56:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.680186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389167e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.490052e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.942886e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.462261e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.577243e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.528523 sec - 5,119,450,809 cycles # 2.867 GHz - 9,180,981,618 instructions # 1.79 insn per cycle - 1.841912956 seconds time elapsed +TOTAL : 1.493949 sec + 5,132,751,598 cycles # 2.932 GHz + 9,273,220,495 instructions # 1.81 insn per cycle + 1.806488011 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.028340e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213140e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.060408e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.246776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.495821 sec - 18,726,914,707 cycles # 2.881 GHz - 45,252,147,765 instructions # 2.42 insn per cycle - 6.501028276 seconds time elapsed +TOTAL : 6.299423 sec + 18,681,964,178 cycles # 2.964 GHz + 45,253,374,151 instructions # 2.42 insn per cycle + 6.304770677 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.215291e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.366977e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366977e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.273103e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.459728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.459728e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.221927 sec - 9,338,555,823 cycles # 2.895 GHz - 22,375,290,209 instructions # 2.40 insn per cycle - 3.227594710 seconds time elapsed +TOTAL : 3.140575 sec + 9,347,181,281 cycles # 2.973 GHz + 22,375,222,796 instructions # 2.39 insn per cycle + 3.145734897 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.376691e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.618820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.618820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.458024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.740265e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740265e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.021316 sec - 8,423,872,827 cycles # 2.784 GHz - 15,815,022,260 instructions # 1.88 insn per cycle - 3.026847541 seconds time elapsed +TOTAL : 2.924345 sec + 8,407,131,354 cycles # 2.871 GHz + 15,815,327,836 instructions # 1.88 insn per cycle + 2.929459615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.398006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.678623e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.678623e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.492494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828517e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.003583 sec - 8,296,430,270 cycles # 2.758 GHz - 15,653,949,933 instructions # 1.89 insn per cycle - 3.009064332 seconds time elapsed +TOTAL : 2.892872 sec + 8,288,121,972 cycles # 2.861 GHz + 15,654,934,223 instructions # 1.89 insn per cycle + 2.897821211 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.376583e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598108e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598108e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.503975e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.843996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.843996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.029921 sec - 6,657,348,870 cycles # 2.194 GHz - 12,894,427,961 instructions # 1.94 insn per cycle - 3.035366895 seconds time elapsed +TOTAL : 2.880221 sec + 6,664,616,674 cycles # 2.311 GHz + 
12,893,939,825 instructions # 1.93 insn per cycle + 2.885338246 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 1a227eb682..eb2dd9920e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:16:39 +DATE: 2025-12-07_17:34:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.199628e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.780940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.098104e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.351402e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.881780e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.055553e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.592040 sec - 2,436,367,118 cycles # 2.822 GHz - 3,629,290,640 instructions # 1.49 insn per cycle - 0.920365880 seconds time elapsed +TOTAL : 0.571551 sec + 2,432,233,482 cycles # 2.913 GHz + 3,687,865,540 instructions # 1.52 insn per cycle + 0.891949542 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.039860e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.223391e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.223391e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258728e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.427980 sec - 18,659,345,357 cycles # 2.901 GHz - 45,239,622,020 instructions # 2.42 insn per cycle - 6.433370102 seconds time elapsed +TOTAL : 6.237937 sec + 18,630,314,995 cycles # 2.985 GHz + 45,238,975,906 instructions # 2.43 insn per cycle + 6.242848987 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.201529e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.346468e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.346468e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.282308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.463918e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.463918e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.240561 sec - 9,296,413,050 cycles # 2.865 GHz - 22,342,996,788 instructions # 2.40 insn per cycle - 3.245872745 seconds time elapsed +TOTAL : 3.125561 sec + 
9,325,718,243 cycles # 2.980 GHz + 22,343,516,057 instructions # 2.40 insn per cycle + 3.130348710 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.385031e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622316e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622316e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.464319e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742686e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742686e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.012220 sec - 8,383,528,688 cycles # 2.779 GHz - 15,803,482,216 instructions # 1.89 insn per cycle - 3.017661777 seconds time elapsed +TOTAL : 2.913036 sec + 8,369,189,226 cycles # 2.869 GHz + 15,803,773,397 instructions # 1.89 insn per cycle + 2.917845112 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.412617e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.685973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.685973e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.505151e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.838610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983146 sec - 8,252,716,563 cycles # 2.763 GHz - 15,642,709,201 instructions # 1.90 insn per cycle - 2.988589217 seconds time elapsed +TOTAL : 2.875401 sec + 8,261,747,915 cycles # 2.870 GHz + 15,648,667,301 instructions # 1.89 insn per cycle + 2.880161068 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.388549e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619875e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.515744e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.858685e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858685e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.016137 
sec - 6,649,228,149 cycles # 2.204 GHz - 12,869,205,720 instructions # 1.94 insn per cycle - 3.020818387 seconds time elapsed +TOTAL : 2.867642 sec + 6,642,980,980 cycles # 2.314 GHz + 12,871,727,740 instructions # 1.94 insn per cycle + 2.872374855 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1672) (512y: 5) (512z: 1432) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 38262df32b..34672b6850 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:17:26 +DATE: 2025-12-07_18:37:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.225159e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.730992e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.784746e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.176085e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.316562e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.531586e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.586772 sec - 2,390,848,405 cycles # 2.830 GHz - 3,635,852,069 instructions # 1.52 insn per cycle - 0.901933192 seconds time elapsed +TOTAL : 0.589247 sec + 2,469,811,532 cycles # 2.882 GHz + 3,762,952,137 instructions # 1.52 insn per cycle + 0.916543888 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.580341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.051291e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.051291e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.634356e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118226e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118226e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.360853 sec - 12,448,339,745 cycles # 2.853 GHz - 32,675,928,488 instructions # 2.62 insn per cycle - 4.365774305 seconds time elapsed +TOTAL : 4.215215 sec + 12,424,758,492 cycles # 2.944 GHz + 32,673,037,566 instructions # 2.63 insn per cycle + 4.220486016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 289) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.653591e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.483795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.483795e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.720954e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.596976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.596976e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.750086 sec - 7,984,215,270 cycles # 2.899 GHz - 18,676,669,518 instructions # 2.34 insn per cycle - 2.755384632 seconds time elapsed +TOTAL : 2.683542 sec + 
8,022,059,691 cycles # 2.985 GHz + 18,676,897,237 instructions # 2.33 insn per cycle + 2.689016088 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1518) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.732255e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.524982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.837388e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.687280e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.687280e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.676787 sec - 7,485,834,946 cycles # 2.792 GHz - 14,289,880,775 instructions # 1.91 insn per cycle - 2.681721539 seconds time elapsed +TOTAL : 2.579172 sec + 7,464,397,072 cycles # 2.889 GHz + 14,290,665,215 instructions # 1.91 insn per cycle + 2.584588842 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2235) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.815938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.713073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.713073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.901070e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.866377e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.866377e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.610308 sec - 7,285,805,876 cycles # 2.787 GHz - 14,002,821,074 instructions # 1.92 insn per cycle - 2.615329640 seconds time elapsed +TOTAL : 2.534133 sec + 7,318,114,298 cycles # 2.883 GHz + 14,003,153,559 instructions # 1.91 insn per cycle + 2.539307882 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.445558e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751827e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751827e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.561924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.973572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.973572e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.952535 sec 
- 6,541,372,214 cycles # 2.212 GHz - 13,442,784,339 instructions # 2.06 insn per cycle - 2.957547644 seconds time elapsed +TOTAL : 2.824573 sec + 6,557,174,199 cycles # 2.318 GHz + 13,442,397,342 instructions # 2.05 insn per cycle + 2.829776123 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 0) (512z: 1195) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 47c3a6f771..5d713b3053 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:17:52 +DATE: 2025-12-07_18:38:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.230358e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.785974e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.903505e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.173287e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.350308e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.616890e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585637 sec - 2,395,685,093 cycles # 2.840 GHz - 3,632,202,579 instructions # 1.52 insn per cycle - 0.900792937 seconds time elapsed +TOTAL : 0.584646 sec + 2,459,654,708 cycles # 2.913 GHz + 3,678,133,585 instructions # 1.50 insn per cycle + 0.902025273 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.167434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.153946e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.153946e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231539e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.241919e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.241919e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.280436 sec - 9,351,045,236 cycles # 2.847 GHz - 25,523,046,940 instructions # 2.73 insn per cycle - 3.285902426 seconds time elapsed +TOTAL : 3.185579 sec + 9,360,419,493 cycles # 2.934 GHz + 25,521,774,182 instructions # 2.73 insn per cycle + 3.190988818 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.975132e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.504192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.504192e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061323e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.660612e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.660612e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.494622 sec - 7,225,776,791 cycles # 2.892 GHz - 16,897,519,367 instructions # 2.34 insn per cycle - 2.499894449 seconds time elapsed +TOTAL : 2.425184 sec + 7,255,843,555 
cycles # 2.986 GHz + 16,897,462,341 instructions # 2.33 insn per cycle + 2.430429644 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.863069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.858307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.858307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.907409e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.889383e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.889383e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.571321 sec - 7,197,624,768 cycles # 2.795 GHz - 13,687,331,488 instructions # 1.90 insn per cycle - 2.576243151 seconds time elapsed +TOTAL : 2.526917 sec + 7,301,571,235 cycles # 2.885 GHz + 13,688,145,482 instructions # 1.87 insn per cycle + 2.532254181 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.912761e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.069621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.069621e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.006834e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.183679e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.183679e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.533153 sec - 7,100,141,299 cycles # 2.799 GHz - 13,497,970,451 instructions # 1.90 insn per cycle - 2.538056554 seconds time elapsed +TOTAL : 2.458837 sec + 7,065,988,283 cycles # 2.868 GHz + 13,495,720,790 instructions # 1.91 insn per cycle + 2.464304737 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.512964e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.923122e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.923122e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.614978e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125849e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.125849e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.885451 sec - 
6,375,003,514 cycles # 2.206 GHz - 13,181,689,692 instructions # 2.07 insn per cycle - 2.890749023 seconds time elapsed +TOTAL : 2.777597 sec + 6,420,076,367 cycles # 2.308 GHz + 13,182,048,562 instructions # 2.05 insn per cycle + 2.782852923 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2031) (512y: 1) (512z: 1091) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling index 78116e7085..3b29ee6d83 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:39:57 +DATE: 2025-12-07_17:56:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.811025e+06 1 256 -5.675268e+06 2 256 -1.125473e+07 4 256 -2.237542e+07 8 256 -4.084889e+07 16 256 -8.038307e+07 32 256 -1.408431e+08 64 256 -2.087041e+08 128 256 -2.617085e+08 256 256 -3.164102e+08 512 256 -3.490720e+08 1024 256 +2.632282e+06 1 256 +6.366813e+06 2 256 +1.215329e+07 4 256 +2.086772e+07 8 256 +3.966648e+07 16 256 +8.884743e+07 32 256 +1.506395e+08 64 256 +2.145963e+08 128 256 +2.680036e+08 256 256 +3.181305e+08 512 256 +3.560177e+08 1024 256 ### GPU: scaling test 32 -3.990821e+05 1 32 -7.057552e+05 2 32 -1.416039e+06 4 32 -2.964129e+06 8 32 -5.593795e+06 16 32 -1.165053e+07 32 32 -2.163693e+07 64 32 -4.137165e+07 128 32 -7.520702e+07 256 32 -1.314590e+08 512 32 -1.948562e+08 1024 32 -2.786288e+08 2048 32 -3.116503e+08 4096 32 -3.644493e+08 8192 32 +3.810023e+05 1 32 +8.507019e+05 2 32 +1.626347e+06 4 32 +2.754525e+06 8 32 +6.062544e+06 16 32 +1.205089e+07 32 32 +2.364459e+07 64 32 +4.428012e+07 128 32 +8.468759e+07 256 32 +1.418762e+08 512 32 +2.153664e+08 1024 32 +2.834847e+08 2048 32 +3.357730e+08 4096 32 +3.681214e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.058031e+06 1 256 -1.064708e+06 2 256 -1.091924e+06 4 256 +1.092262e+06 1 256 +1.011848e+06 2 256 +1.045423e+06 4 256 ### CPU: scaling test 32 -9.653674e+05 1 32 -1.073826e+06 2 32 -1.086320e+06 4 32 +7.288464e+05 1 32 +1.073177e+06 2 32 +1.011986e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.851906e+06 1 256 -1.832695e+06 2 256 -1.916161e+06 4 256 +1.750643e+06 1 256 +1.713836e+06 2 256 +1.871400e+06 4 256 ### CPU: scaling test 32 -1.906351e+06 1 32 -1.246470e+06 2 32 -1.664802e+06 4 32 +1.926434e+06 1 32 +1.891421e+06 2 32 +1.932805e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.709626e+06 1 256 -2.644942e+06 2 256 -2.445350e+06 4 256 +2.427875e+06 1 256 +2.214274e+06 2 256 +2.689344e+06 4 256 ### CPU: scaling test 32 -2.186539e+06 1 32 -2.363281e+06 2 32 -2.641954e+06 4 32 +2.359708e+06 1 32 +2.361972e+06 2 32 +2.646433e+06 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.767179e+06 1 256 -2.686691e+06 2 256 -2.759654e+06 4 256 +2.772363e+06 1 256 +2.799558e+06 2 256 +2.762498e+06 4 256 ### CPU: scaling test 32 -1.340876e+06 1 32 -2.416645e+06 2 32 -2.506708e+06 4 32 +2.402944e+06 1 32 +2.422774e+06 2 32 +2.714855e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.171313e+06 1 256 -2.276072e+06 2 256 -2.282286e+06 4 256 +2.144305e+06 1 256 +2.128137e+06 2 256 +2.268367e+06 4 256 ### CPU: scaling test 32 -1.265823e+06 1 32 -1.671673e+06 2 32 -2.039028e+06 4 32 +1.303356e+06 1 32 +1.725346e+06 2 32 +1.614612e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index caf7cf3a58..bf1e8173ed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:14:54 +DATE: 2025-12-07_17:32:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.254014e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.994980e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.902542e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.707020e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089402e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.911532e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693324 sec - 2,725,071,311 cycles # 2.836 GHz - 4,080,796,637 instructions # 1.50 insn per cycle - 1.023122717 seconds time elapsed +TOTAL : 0.672637 sec + 2,728,265,379 cycles # 2.919 GHz + 4,086,957,022 instructions # 1.50 insn per cycle + 0.994936782 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.004559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167053e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.034838e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200526e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200526e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.681187 sec - 19,310,569,163 cycles # 2.888 GHz - 46,561,074,047 instructions # 2.41 insn per cycle - 6.686779372 seconds time elapsed +TOTAL : 6.481862 sec + 19,306,388,487 cycles # 2.977 GHz + 46,560,452,569 instructions # 2.41 insn per cycle + 6.486899910 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.592071e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.095366e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.095366e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.646393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.167481e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.167481e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.374152 sec - 12,572,513,674 cycles # 2.872 GHz - 31,463,286,168 instructions # 2.50 insn per cycle - 4.379862583 seconds time elapsed +TOTAL : 4.233282 sec + 
12,545,358,324 cycles # 2.961 GHz + 31,461,646,193 instructions # 2.51 insn per cycle + 4.238198097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.700921e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.700921e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.017174e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.804577e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.804577e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.662440 sec - 10,121,778,715 cycles # 2.760 GHz - 19,471,159,122 instructions # 1.92 insn per cycle - 3.668260640 seconds time elapsed +TOTAL : 3.521214 sec + 10,038,452,005 cycles # 2.847 GHz + 19,471,417,193 instructions # 1.94 insn per cycle + 3.526071416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.971771e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738449e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040312e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.835469e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.835469e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.605464 sec - 9,883,989,440 cycles # 2.738 GHz - 19,284,997,724 instructions # 1.95 insn per cycle - 3.611144081 seconds time elapsed +TOTAL : 3.492419 sec + 9,879,096,278 cycles # 2.826 GHz + 19,285,454,606 instructions # 1.95 insn per cycle + 3.497439685 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1786) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.763507e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.351410e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.351410e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.818980e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.432178e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.432178e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.983402 
sec - 8,347,852,448 cycles # 2.093 GHz - 14,994,758,047 instructions # 1.80 insn per cycle - 3.989072483 seconds time elapsed +TOTAL : 3.867452 sec + 8,347,555,236 cycles # 2.156 GHz + 14,995,323,167 instructions # 1.80 insn per cycle + 3.872483648 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 952) (512y: 154) (512z: 1313) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index f781dc1bb5..17cd6abba8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:15:31 +DATE: 2025-12-07_17:33:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.263252e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.017320e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.920339e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.724896e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.114491e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.946526e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.689357 sec - 2,740,273,431 cycles # 2.852 GHz - 4,084,188,832 instructions # 1.49 insn per cycle - 1.021206637 seconds time elapsed +TOTAL : 0.670484 sec + 2,714,515,604 cycles # 2.910 GHz + 4,127,913,209 instructions # 1.52 insn per cycle + 0.992575981 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.004380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167437e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167437e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.034507e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201771e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201771e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.681530 sec - 19,329,038,472 cycles # 2.891 GHz - 46,534,784,670 instructions # 2.41 insn per cycle - 6.687165929 seconds time elapsed +TOTAL : 6.485887 sec + 19,304,325,248 cycles # 2.975 GHz + 46,537,637,564 instructions # 2.41 insn per cycle + 6.491015459 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.608782e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.123511e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.123511e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.198515e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.198515e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.330389 sec - 12,526,304,265 cycles # 2.890 GHz - 31,429,125,016 instructions # 2.51 insn per cycle - 4.336065673 seconds time elapsed +TOTAL : 4.185229 sec + 
12,517,954,154 cycles # 2.988 GHz + 31,429,177,076 instructions # 2.51 insn per cycle + 4.190312345 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.942808e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.702933e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.702933e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.998664e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.788117e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.788117e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652389 sec - 10,126,359,115 cycles # 2.769 GHz - 19,454,993,368 instructions # 1.92 insn per cycle - 3.658235344 seconds time elapsed +TOTAL : 3.549369 sec + 10,129,588,690 cycles # 2.851 GHz + 19,455,084,393 instructions # 1.92 insn per cycle + 3.554455132 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2019) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.957600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738598e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738598e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.049149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.867183e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.867183e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.629719 sec - 9,979,298,276 cycles # 2.746 GHz - 19,273,169,438 instructions # 1.93 insn per cycle - 3.635438116 seconds time elapsed +TOTAL : 3.468805 sec + 9,914,475,933 cycles # 2.855 GHz + 19,273,303,426 instructions # 1.94 insn per cycle + 3.473774431 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.800984e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.418771e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.418771e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.905625e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577538e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577538e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.911829 
sec - 8,199,622,084 cycles # 2.094 GHz - 14,847,008,944 instructions # 1.81 insn per cycle - 3.917306895 seconds time elapsed +TOTAL : 3.703342 sec + 8,187,263,740 cycles # 2.209 GHz + 14,847,327,904 instructions # 1.81 insn per cycle + 3.708507538 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 941) (512y: 155) (512z: 1281) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling index 4703fd43b7..9a0cac6210 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:40:39 +DATE: 2025-12-07_17:57:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.383253e+06 1 256 -2.893064e+06 2 256 -5.376118e+06 4 256 -1.185151e+07 8 256 -2.346081e+07 16 256 -4.511286e+07 32 256 -5.630221e+07 64 256 -6.196121e+07 128 256 -6.780047e+07 256 256 -7.309787e+07 512 256 -7.376814e+07 1024 256 +1.504360e+06 1 256 +2.824672e+06 2 256 +6.056305e+06 4 256 +1.191758e+07 8 256 +2.405125e+07 16 256 +4.536041e+07 32 256 +5.939029e+07 64 256 +6.287802e+07 128 256 +6.881027e+07 256 256 +7.343192e+07 512 256 +7.490252e+07 1024 256 ### GPU: scaling test 32 -1.722124e+05 1 32 -3.905487e+05 2 32 -6.832898e+05 4 32 -1.517739e+06 8 32 -2.835858e+06 16 32 -6.130048e+06 32 32 -1.120344e+07 64 32 -2.084478e+07 128 32 -4.106718e+07 256 32 -5.763008e+07 512 32 -6.090072e+07 1024 32 -6.706632e+07 2048 32 -7.231618e+07 4096 32 -7.501823e+07 8192 32 +2.071291e+05 1 32 +3.726520e+05 2 32 +7.472358e+05 4 32 +1.647531e+06 8 32 +3.318663e+06 16 32 +6.183351e+06 32 32 +1.286262e+07 64 32 +2.130714e+07 128 32 +4.478191e+07 256 32 +5.743270e+07 512 32 +6.289637e+07 1024 32 +6.800061e+07 2048 32 +7.282154e+07 4096 32 +7.497172e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.767984e+05 1 256 -1.796605e+05 2 256 -1.802476e+05 4 256 +1.743771e+05 1 256 +1.789615e+05 2 256 +1.793983e+05 4 256 ### CPU: scaling test 32 -1.472612e+05 1 32 -1.715919e+05 2 32 -1.711413e+05 4 32 +1.653354e+05 1 32 +1.670765e+05 2 32 +1.696164e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.982512e+05 1 256 -3.086531e+05 2 256 -3.162558e+05 4 256 +2.972096e+05 1 256 +3.127377e+05 2 256 +3.171063e+05 4 256 ### CPU: scaling test 32 -2.995750e+05 1 32 -2.938112e+05 2 32 -2.996907e+05 4 32 +2.804140e+05 1 32 +2.801452e+05 2 32 +2.952152e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.811704e+05 1 256 -4.983434e+05 2 256 -5.240082e+05 4 256 +5.256328e+05 1 256 +5.088887e+05 2 256 +5.167831e+05 4 256 ### CPU: scaling test 32 -4.296686e+05 1 32 -4.897722e+05 2 32 -4.790509e+05 4 32 +5.200293e+05 1 32 +5.299900e+05 2 32 +5.167020e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.039122e+05 1 256 -5.537973e+05 2 256 -5.292318e+05 4 256 +5.501827e+05 1 256 +5.527856e+05 2 256 +5.527045e+05 4 256 ### CPU: scaling test 32 -5.049628e+05 1 32 -5.163039e+05 2 32 -5.558813e+05 4 32 +4.798825e+05 1 32 +5.528346e+05 2 32 +5.568946e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.352738e+05 1 256 -3.531052e+05 2 256 -3.524363e+05 4 256 +3.500611e+05 1 256 +3.500527e+05 2 256 +3.555787e+05 4 256 ### CPU: scaling test 32 -3.508580e+05 1 32 -3.508926e+05 2 32 -3.509426e+05 4 32 +3.522677e+05 1 32 +3.561610e+05 2 32 +3.593426e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b83fe948f8..f54c9bf039 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:17:08 +DATE: 2025-12-07_17:34:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.814869e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.187282e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.582493e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.912778e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.221138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.596615e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.541191 sec - 2,309,968,372 cycles # 2.848 GHz - 3,226,495,089 instructions # 1.40 insn per cycle - 0.869698260 seconds time elapsed +TOTAL : 0.534079 sec + 2,244,205,132 cycles # 2.820 GHz + 3,197,191,815 instructions # 1.42 insn per cycle + 0.853682446 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.792870e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839272e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839272e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.842360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.889130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.889130e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.956913 sec - 17,261,214,247 cycles # 2.896 GHz - 46,320,121,297 instructions # 2.68 insn per cycle - 5.962421755 seconds time elapsed +TOTAL : 5.797048 sec + 17,254,810,947 cycles # 2.975 GHz + 46,321,303,307 instructions # 2.68 insn per cycle + 5.802070999 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.087487e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238823e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.381794e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.381794e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.506189 sec - 10,088,639,728 cycles # 2.873 GHz - 27,919,288,717 instructions # 2.77 insn per cycle - 3.512045055 seconds time elapsed +TOTAL : 3.359528 sec + 10,055,422,451 cycles # 2.989 GHz 
+ 27,919,716,702 instructions # 2.78 insn per cycle + 3.364655765 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.914379e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.288444e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.288444e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.918104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.281264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.281264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.241997 sec - 6,102,243,675 cycles # 2.716 GHz - 12,609,784,840 instructions # 2.07 insn per cycle - 2.247857659 seconds time elapsed +TOTAL : 2.237782 sec + 6,089,499,133 cycles # 2.717 GHz + 12,610,527,635 instructions # 2.07 insn per cycle + 2.242751421 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.130809e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.541182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.541182e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.346793e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.775278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.775278e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.151754 sec - 5,849,443,539 cycles # 2.712 GHz - 12,186,163,621 instructions # 2.08 insn per cycle - 2.157524773 seconds time elapsed +TOTAL : 2.065501 sec + 5,826,057,880 cycles # 2.815 GHz + 12,184,898,885 instructions # 2.09 insn per cycle + 2.070670453 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.453655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.631223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.631223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.506507e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.684920e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.684920e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.144840 sec - 5,734,260,839 cycles # 1.821 GHz - 
8,277,135,516 instructions # 1.44 insn per cycle - 3.150611128 seconds time elapsed +TOTAL : 3.095462 sec + 5,726,660,139 cycles # 1.848 GHz + 8,279,441,648 instructions # 1.45 insn per cycle + 3.100631344 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling index 28ed30edba..aa3e04a1d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:54:51 +DATE: 2025-12-07_18:11:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -4.305698e+05 1 256 -8.421080e+05 2 256 -1.658112e+06 4 256 -2.989838e+06 8 256 -4.972377e+06 16 256 -7.105357e+06 32 256 -9.196651e+06 64 256 -1.028995e+07 128 256 -1.118682e+07 256 256 -1.170520e+07 512 256 -1.194760e+07 1024 256 +4.319786e+05 1 256 +9.574710e+05 2 256 +1.780811e+06 4 256 +3.161572e+06 8 256 +5.204178e+06 16 256 +7.298549e+06 32 256 +9.351790e+06 64 256 +1.042379e+07 128 256 +1.124106e+07 256 256 +1.172530e+07 512 256 +1.198293e+07 1024 256 ### GPU: scaling test 32 -5.803167e+04 1 32 -1.141868e+05 2 32 -2.280709e+05 4 32 -4.392090e+05 8 32 -8.271820e+05 16 32 -1.628245e+06 32 32 -3.150764e+06 64 32 -5.031576e+06 128 32 -7.100399e+06 256 32 -9.298129e+06 512 32 -1.037459e+07 1024 32 -1.113939e+07 2048 32 -1.172028e+07 4096 32 -1.198120e+07 8192 32 +6.193820e+04 1 32 +1.226188e+05 2 32 +2.368752e+05 4 32 +5.021922e+05 8 32 +9.439632e+05 16 32 +1.830562e+06 32 32 +3.137385e+06 64 32 +5.156248e+06 128 32 +7.290552e+06 256 32 +9.388950e+06 512 32 +1.040608e+07 1024 32 +1.126030e+07 2048 32 +1.174572e+07 4096 32 +1.202371e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.715304e+05 1 256 -1.781417e+05 2 256 -1.794714e+05 4 256 +1.756279e+05 1 256 +1.708467e+05 2 256 +1.805656e+05 4 256 ### CPU: scaling test 32 -1.577069e+05 1 32 -1.683648e+05 2 32 -1.674260e+05 4 32 +1.708079e+05 1 32 +1.683643e+05 2 32 +1.687250e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.985670e+05 1 256 -3.075757e+05 2 256 -3.131579e+05 4 256 +2.988416e+05 1 256 +3.107954e+05 2 256 +3.121627e+05 4 256 ### CPU: scaling test 32 -2.725469e+05 1 32 -2.816294e+05 2 32 -2.958942e+05 4 32 +3.001792e+05 1 32 +2.770191e+05 2 32 +2.925173e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.247762e+05 1 256 -5.241155e+05 2 256 -4.852917e+05 4 256 +5.229111e+05 1 256 +5.247396e+05 2 256 +5.278955e+05 4 256 ### CPU: scaling test 32 -5.186974e+05 1 32 -5.291399e+05 2 32 -5.305920e+05 4 32 +5.195818e+05 1 32 +5.314423e+05 2 32 +5.143992e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.514805e+05 1 256 -5.505359e+05 2 256 -5.563984e+05 4 256 +4.871597e+05 1 256 +5.527701e+05 2 256 +5.524901e+05 4 256 ### CPU: scaling test 32 -5.060969e+05 1 32 -5.545783e+05 2 32 -4.913100e+05 4 32 +3.862169e+05 1 32 +5.558161e+05 2 32 +5.598149e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.339783e+05 1 256 -3.535899e+05 2 256 -3.481939e+05 4 256 +3.496709e+05 1 256 +3.478363e+05 2 256 +3.569865e+05 4 256 ### CPU: scaling test 32 -3.145334e+05 1 32 -3.563455e+05 2 32 -3.387686e+05 4 32 +3.090563e+05 1 32 +3.410914e+05 2 32 +3.496952e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt index 898eec66e3..834c85aca0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:50:32 +DATE: 2025-12-07_18:07:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200767e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.008485e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.143139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.151918e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.316417 sec - 4,841,050,091 cycles # 2.845 GHz - 6,855,412,132 instructions # 1.42 insn per cycle - 1.762497593 seconds time elapsed +TOTAL : 1.960092 sec + 4,988,972,410 cycles # 2.891 GHz + 6,966,822,313 instructions # 1.40 insn per cycle + 2.406937451 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.782393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.828671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828671e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.843073e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.889585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.889585e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.991425 sec - 17,268,124,515 cycles # 2.880 GHz - 46,321,023,545 instructions # 2.68 insn per cycle - 5.996950400 seconds time elapsed +TOTAL : 5.794348 sec + 17,272,428,285 cycles # 2.979 GHz + 46,324,546,046 instructions # 2.68 insn per cycle + 5.799865657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.120284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.210660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.370489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.370489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.468964 sec - 10,062,208,508 cycles # 2.897 GHz - 27,919,768,700 instructions # 2.77 insn per cycle - 3.474512429 seconds time elapsed +TOTAL : 3.371604 sec + 10,058,951,785 cycles # 2.980 
GHz + 27,919,382,257 instructions # 2.78 insn per cycle + 3.376985819 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.922035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.300092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.300092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.995371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.384935e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.384935e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.238317 sec - 6,090,888,500 cycles # 2.716 GHz - 12,608,791,480 instructions # 2.07 insn per cycle - 2.243747530 seconds time elapsed +TOTAL : 2.205695 sec + 6,087,503,688 cycles # 2.755 GHz + 12,610,314,859 instructions # 2.07 insn per cycle + 2.210929439 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.153909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.564898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.564898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.327292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.757722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.757722e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.141769 sec - 5,839,015,371 cycles # 2.721 GHz - 12,183,200,067 instructions # 2.09 insn per cycle - 2.147164385 seconds time elapsed +TOTAL : 2.073643 sec + 5,833,634,503 cycles # 2.807 GHz + 12,186,499,709 instructions # 2.09 insn per cycle + 2.078884728 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.421281e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.595508e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.595508e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.512867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.692825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.692825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.172923 sec - 5,704,193,065 cycles # 1.795 GHz - 
8,277,048,290 instructions # 1.45 insn per cycle - 3.178502846 seconds time elapsed +TOTAL : 3.090217 sec + 5,730,288,981 cycles # 1.852 GHz + 8,277,958,227 instructions # 1.44 insn per cycle + 3.095407678 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 8fbb21e9ff..628978faff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:28:38 +DATE: 2025-12-07_18:48:24 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.427555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.769300e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.769300e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.347416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613159e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.828718 sec - 3,186,820,693 cycles # 2.852 GHz - 4,808,126,394 instructions # 1.51 insn per cycle - 1.176249753 seconds time elapsed +TOTAL : 0.828674 sec + 3,176,260,796 cycles # 2.867 GHz + 4,820,452,418 instructions # 1.52 insn per cycle + 1.165054369 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.774052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.819717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.819717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.858008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.098613 sec - 17,597,864,140 cycles # 2.883 GHz - 46,380,415,047 instructions # 2.64 insn per cycle - 6.105859903 seconds time elapsed +TOTAL : 5.973801 sec + 17,609,259,337 cycles # 2.945 GHz + 46,380,461,505 instructions # 2.63 insn per cycle + 5.980829379 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.088043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238153e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.151959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.306778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.306778e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.585879 sec - 10,400,318,731 cycles # 2.896 GHz - 28,093,070,719 instructions # 2.70 insn per cycle - 3.593178065 seconds time elapsed +TOTAL : 3.512283 sec + 10,383,475,141 cycles # 2.951 GHz + 28,093,657,867 instructions # 2.71 insn per cycle + 3.519198431 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.807610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.170791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.170791e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.920912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.295732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.295732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.371916 sec - 6,428,829,911 cycles # 2.703 GHz - 12,887,812,684 instructions # 2.00 insn per cycle - 2.379156266 seconds time elapsed +TOTAL : 2.319591 sec + 6,457,332,515 cycles # 2.777 GHz + 12,887,869,878 instructions # 2.00 insn per cycle + 2.326484377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.017593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.406809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.406809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.155839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.566234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.566234e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.281231 sec - 6,165,327,004 cycles # 2.695 GHz - 12,463,334,301 instructions # 2.02 insn per cycle - 2.288346369 seconds time elapsed +TOTAL : 2.221931 sec + 6,208,706,341 cycles # 2.787 GHz + 12,462,200,359 instructions # 2.01 insn per cycle + 2.228888893 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.356453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.451581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.626912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.626912e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.315612 sec - 6,121,266,749 cycles # 1.843 GHz - 8,516,898,541 instructions # 1.39 insn per cycle - 3.322530830 seconds time elapsed +TOTAL : 3.225905 sec + 6,090,784,165 cycles # 1.885 GHz + 8,513,272,685 instructions # 1.40 insn per cycle + 3.232765547 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 26e0f25894..78afb02980 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:44:00 +DATE: 2025-12-07_19:03:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.725056e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.186541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.580567e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.749033e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.166657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571286e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.638610 sec - 2,571,549,393 cycles # 2.847 GHz - 3,659,796,797 instructions # 1.42 insn per cycle - 0.960427498 seconds time elapsed +TOTAL : 0.638014 sec + 2,556,790,415 cycles # 2.840 GHz + 3,662,730,188 instructions # 1.43 insn per cycle + 0.956947939 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.781185e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826305e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.847887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894450e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 6.057966 sec - 17,438,379,118 cycles # 2.877 GHz - 46,337,653,518 instructions # 2.66 insn per cycle - 6.063608366 seconds time elapsed +TOTAL : 5.839142 sec + 17,459,486,822 cycles # 2.988 GHz + 46,337,419,167 instructions # 2.65 insn per cycle + 5.844551162 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.115210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.268081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.268081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.206514e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.365518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.365518e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.536392 sec - 10,229,702,343 cycles # 2.889 GHz - 27,918,943,570 instructions # 2.73 insn per cycle - 3.542208033 seconds time elapsed +TOTAL : 3.437357 sec + 10,251,116,586 cycles # 
2.979 GHz + 27,919,544,360 instructions # 2.72 insn per cycle + 3.442794591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.877271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.247954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.247954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.103607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.502534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502534e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.320644 sec - 6,288,847,916 cycles # 2.704 GHz - 12,592,903,872 instructions # 2.00 insn per cycle - 2.326302778 seconds time elapsed +TOTAL : 2.220265 sec + 6,280,977,368 cycles # 2.823 GHz + 12,592,927,228 instructions # 2.00 insn per cycle + 2.225817067 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.123817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.531393e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.531393e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.344221e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.775800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.775800e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.218321 sec - 6,014,515,797 cycles # 2.706 GHz - 12,133,309,602 instructions # 2.02 insn per cycle - 2.224085333 seconds time elapsed +TOTAL : 2.128321 sec + 6,013,947,593 cycles # 2.820 GHz + 12,133,979,564 instructions # 2.02 insn per cycle + 2.133542684 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.381723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.553268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.546021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.728735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728735e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.273257 sec - 5,933,511,412 cycles # 1.811 GHz - 
8,229,034,215 instructions # 1.39 insn per cycle - 3.278919832 seconds time elapsed +TOTAL : 3.123060 sec + 5,917,588,491 cycles # 1.892 GHz + 8,226,533,903 instructions # 1.39 insn per cycle + 3.128704573 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 4d5855b54d..714dfd2e9c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:40:27 +DATE: 2025-12-07_18:59:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.767730e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.205228e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.589097e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.756565e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.192192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.604383e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.575649 sec - 2,386,111,811 cycles # 2.845 GHz - 3,639,741,256 instructions # 1.53 insn per cycle - 0.895952286 seconds time elapsed +TOTAL : 0.574036 sec + 2,349,966,103 cycles # 2.814 GHz + 3,637,169,909 instructions # 1.55 insn per cycle + 0.892619396 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.791051e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.846381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893217e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.963163 sec - 17,264,643,304 cycles # 2.893 GHz - 46,321,097,140 instructions # 2.68 insn per cycle - 5.968989618 seconds time elapsed +TOTAL : 5.783609 sec + 17,275,173,441 cycles # 2.985 GHz + 46,323,242,988 instructions # 2.68 insn per cycle + 5.789056566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.101295e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.253753e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.253753e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.215303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.374209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.374209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.491410 sec - 10,059,054,482 cycles # 2.877 GHz - 27,919,466,540 instructions # 2.78 insn per cycle - 3.497008176 seconds time elapsed +TOTAL : 3.367777 sec + 10,063,050,391 cycles # 
2.985 GHz + 27,919,564,319 instructions # 2.77 insn per cycle + 3.373206810 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.890079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.263113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.263113e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.104997e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.500300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.500300e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.254459 sec - 6,084,381,375 cycles # 2.693 GHz - 12,610,002,661 instructions # 2.07 insn per cycle - 2.260263260 seconds time elapsed +TOTAL : 2.159822 sec + 6,087,923,018 cycles # 2.813 GHz + 12,610,130,845 instructions # 2.07 insn per cycle + 2.165317701 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.141713e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.554289e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.554289e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.181697e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.147865 sec - 5,852,500,330 cycles # 2.720 GHz - 12,186,332,321 instructions # 2.08 insn per cycle - 2.153550767 seconds time elapsed +TOTAL : 2.129696 sec + 5,842,486,902 cycles # 2.737 GHz + 12,185,192,045 instructions # 2.09 insn per cycle + 2.135143705 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.413552e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.588205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.588205e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.613657e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806093e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.180124 sec - 5,723,407,148 cycles # 1.797 GHz - 
8,277,947,646 instructions # 1.45 insn per cycle - 3.185775207 seconds time elapsed +TOTAL : 3.007646 sec + 5,728,390,385 cycles # 1.902 GHz + 8,277,715,045 instructions # 1.45 insn per cycle + 3.013179292 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt index 4b28e0c827..105e732bd3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:49:10 +DATE: 2025-12-07_19:15:20 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.755096e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.215389e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.607884e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.746180e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.148806e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.553115e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539292 sec - 2,216,200,050 cycles # 2.846 GHz - 3,157,615,309 instructions # 1.42 insn per cycle - 0.835257331 seconds time elapsed +TOTAL : 0.532755 sec + 2,242,697,063 cycles # 2.913 GHz + 3,229,531,145 instructions # 1.44 insn per cycle + 0.826514046 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.787183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.832888e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.832888e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820864e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.866564e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.866564e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.975964 sec - 17,260,345,803 cycles # 2.886 GHz - 46,320,336,029 instructions # 2.68 insn per cycle - 5.981639118 seconds time elapsed +TOTAL : 5.864284 sec + 17,275,079,846 cycles # 2.944 GHz + 46,322,210,908 instructions # 2.68 insn per cycle + 5.869624522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.111247e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.265577e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.265577e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.219530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379931e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379931e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.479269 sec - 10,044,184,434 cycles # 2.883 GHz - 27,919,122,564 instructions # 2.78 insn per cycle - 3.485095741 seconds time elapsed +TOTAL : 3.362131 sec + 10,051,335,054 cycles # 2.986 
GHz + 27,920,745,588 instructions # 2.78 insn per cycle + 3.367389056 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.905590e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.283676e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.283676e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.047808e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.441366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.441366e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.245986 sec - 6,089,248,282 cycles # 2.705 GHz - 12,609,705,263 instructions # 2.07 insn per cycle - 2.251881277 seconds time elapsed +TOTAL : 2.184282 sec + 6,107,536,822 cycles # 2.791 GHz + 12,610,307,454 instructions # 2.06 insn per cycle + 2.189590856 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.148141e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.559740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.559740e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.322998e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.750632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.750632e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.144804 sec - 5,824,946,914 cycles # 2.710 GHz - 12,184,657,847 instructions # 2.09 insn per cycle - 2.150527846 seconds time elapsed +TOTAL : 2.075401 sec + 5,842,465,304 cycles # 2.809 GHz + 12,186,466,591 instructions # 2.09 insn per cycle + 2.080764137 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.423895e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.599460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.599460e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.568052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.752448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752448e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.171890 sec - 5,741,396,850 cycles # 1.808 GHz - 
8,278,034,433 instructions # 1.44 insn per cycle - 3.177718293 seconds time elapsed +TOTAL : 3.044526 sec + 5,754,253,930 cycles # 1.887 GHz + 8,277,598,608 instructions # 1.44 insn per cycle + 3.049924237 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e5e06f1218..b96ad53f4f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:37:03 +DATE: 2025-12-07_18:56:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.626435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.214094e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.587498e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.715021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178689e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.585270e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.726364 sec - 2,849,514,717 cycles # 2.845 GHz - 4,382,574,758 instructions # 1.54 insn per cycle - 1.057928884 seconds time elapsed +TOTAL : 0.716772 sec + 2,848,753,634 cycles # 2.913 GHz + 4,423,570,000 instructions # 1.55 insn per cycle + 1.035480801 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.789888e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835303e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.843797e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.891037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891037e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.967334 sec - 17,272,703,409 cycles # 2.893 GHz - 46,321,862,531 instructions # 2.68 insn per cycle - 5.973038452 seconds time elapsed +TOTAL : 5.792528 sec + 17,266,428,402 cycles # 2.979 GHz + 46,321,589,992 instructions # 2.68 insn per cycle + 5.798095100 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.088498e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.176716e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331063e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.504822 sec - 10,065,494,953 cycles # 2.868 GHz - 27,919,546,717 instructions # 2.77 insn per cycle - 3.510554362 seconds time elapsed +TOTAL : 3.405780 sec + 10,046,602,468 cycles # 2.946 GHz + 27,920,648,997 instructions # 2.78 insn per cycle + 3.411241970 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.895401e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.272281e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.272281e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.091422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.489746e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251790 sec - 6,086,448,139 cycles # 2.697 GHz - 12,610,253,243 instructions # 2.07 insn per cycle - 2.257658692 seconds time elapsed +TOTAL : 2.164260 sec + 6,102,819,668 cycles # 2.814 GHz + 12,610,158,980 instructions # 2.07 insn per cycle + 2.169564540 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.104544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.508827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.508827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.747138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.747138e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.163370 sec - 5,848,310,473 cycles # 2.697 GHz - 12,186,147,335 instructions # 2.08 insn per cycle - 2.169166916 seconds time elapsed +TOTAL : 2.078995 sec + 5,845,519,229 cycles # 2.806 GHz + 12,186,886,441 instructions # 2.08 insn per cycle + 2.084393146 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.561890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.747607e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.747607e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.198349 sec - 5,734,393,208 cycles # 1.791 GHz - 8,277,908,197 instructions # 1.44 insn per cycle - 3.204254400 seconds time elapsed +TOTAL : 3.050907 sec + 5,740,892,854 cycles # 1.879 GHz + 8,277,576,142 instructions # 1.44 insn per cycle + 3.056401656 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 09986e5034..793ecf7fab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:17:41 +DATE: 2025-12-07_17:35:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.740251e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.070566e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446622e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.878688e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.102567e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.467218e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.542467 sec - 2,308,061,310 cycles # 2.843 GHz - 3,180,365,192 instructions # 1.38 insn per cycle - 0.870299018 seconds time elapsed +TOTAL : 0.531753 sec + 2,297,890,140 cycles # 2.896 GHz + 3,225,329,047 instructions # 1.40 insn per cycle + 0.851888185 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.832732e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880113e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948450e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.829901 sec - 16,848,535,293 cycles # 2.888 GHz - 45,296,509,977 instructions # 2.69 insn per cycle - 5.835776505 seconds time elapsed +TOTAL : 5.626242 sec + 16,852,728,675 cycles # 2.993 GHz + 45,296,927,324 instructions # 2.69 insn per cycle + 5.631568522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.271423e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.440008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.440008e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.390365e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566395e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566395e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.314065 sec - 9,572,123,137 cycles # 2.885 GHz - 26,751,815,901 instructions # 2.79 insn per cycle - 3.319563861 seconds time elapsed +TOTAL : 3.198092 sec + 9,569,393,167 cycles # 2.988 GHz + 
26,752,118,701 instructions # 2.80 insn per cycle + 3.203173922 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.827414e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.827414e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.674769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.996063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996063e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431404 sec - 6,623,808,841 cycles # 2.719 GHz - 14,177,690,165 instructions # 2.14 insn per cycle - 2.437208264 seconds time elapsed +TOTAL : 2.348069 sec + 6,607,936,071 cycles # 2.809 GHz + 14,177,724,315 instructions # 2.15 insn per cycle + 2.353185076 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701345e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.040507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.040507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.794812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.137640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.137640e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.338470 sec - 6,401,665,095 cycles # 2.732 GHz - 13,769,940,318 instructions # 2.15 insn per cycle - 2.344318448 seconds time elapsed +TOTAL : 2.291933 sec + 6,391,736,532 cycles # 2.784 GHz + 13,770,771,772 instructions # 2.15 insn per cycle + 2.297066829 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.303189e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466084e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466084e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.441814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.613720e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.613720e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.283375 sec - 5,957,178,129 cycles # 1.812 GHz - 10,086,124,192 
instructions # 1.69 insn per cycle - 3.289028880 seconds time elapsed +TOTAL : 3.151201 sec + 5,916,807,773 cycles # 1.875 GHz + 10,086,529,937 instructions # 1.70 insn per cycle + 3.156238125 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1276) (512y: 208) (512z: 1988) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 0d42001848..9b5f78ee27 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:18:17 +DATE: 2025-12-07_18:38:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.785771e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.171465e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.568632e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.608680e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.126787e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.562960e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539437 sec - 2,324,660,140 cycles # 2.833 GHz - 3,221,828,743 instructions # 1.39 insn per cycle - 0.878217469 seconds time elapsed +TOTAL : 0.537334 sec + 2,373,374,326 cycles # 2.898 GHz + 3,305,657,476 instructions # 1.39 insn per cycle + 0.875244058 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.387107e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.469288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.469288e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.438579e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.521661e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.521661e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.501541 sec - 13,071,399,497 cycles # 2.901 GHz - 34,739,078,110 instructions # 2.66 insn per cycle - 4.507191858 seconds time elapsed +TOTAL : 4.406287 sec + 13,102,419,231 cycles # 2.970 GHz + 34,738,898,085 instructions # 2.65 insn per cycle + 4.412013798 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.901021e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.033616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.033616e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.950606e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.087796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.087796e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.723435 sec - 10,832,687,449 cycles # 2.906 GHz - 24,282,426,073 instructions # 2.24 insn per cycle - 3.728894903 seconds time elapsed +TOTAL : 3.662094 sec + 10,869,927,351 cycles # 2.964 GHz 
+ 24,282,901,892 instructions # 2.23 insn per cycle + 3.667822592 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2579) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.388729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.571747e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.885870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.885870e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.497295 sec - 6,743,813,449 cycles # 2.696 GHz - 12,543,269,382 instructions # 1.86 insn per cycle - 2.502704497 seconds time elapsed +TOTAL : 2.400060 sec + 6,745,065,598 cycles # 2.805 GHz + 12,543,779,660 instructions # 1.86 insn per cycle + 2.405766016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.006867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.006867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.886609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.247588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247588e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.362181 sec - 6,370,126,838 cycles # 2.692 GHz - 11,708,850,355 instructions # 1.84 insn per cycle - 2.367368593 seconds time elapsed +TOTAL : 2.252561 sec + 6,358,554,233 cycles # 2.817 GHz + 11,706,922,355 instructions # 1.84 insn per cycle + 2.258154979 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2674) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.672883e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.874095e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.874095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.881607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.102647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.102647e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.962382 sec - 5,387,973,040 cycles # 1.816 GHz - 
9,344,687,874 instructions # 1.73 insn per cycle - 2.967757912 seconds time elapsed +TOTAL : 2.808935 sec + 5,386,176,175 cycles # 1.914 GHz + 9,345,011,726 instructions # 1.73 insn per cycle + 2.814494105 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2107) (512y: 282) (512z: 1954) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 1f895c929f..def4b3f0d8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:18:48 +DATE: 2025-12-07_18:39:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.773620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.074692e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.456461e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.572553e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.018066e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.444811e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534811 sec - 2,266,123,133 cycles # 2.828 GHz - 3,168,944,538 instructions # 1.40 insn per cycle - 0.857996121 seconds time elapsed +TOTAL : 0.534132 sec + 2,337,052,817 cycles # 2.917 GHz + 3,262,251,196 instructions # 1.40 insn per cycle + 0.857930400 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.506524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.597769e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.597769e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.577044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.669503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.669503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.291386 sec - 12,399,672,738 cycles # 2.887 GHz - 35,290,415,137 instructions # 2.85 insn per cycle - 4.296907910 seconds time elapsed +TOTAL : 4.175004 sec + 12,431,322,463 cycles # 2.974 GHz + 35,290,548,133 instructions # 2.84 insn per cycle + 4.180683377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.891328e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022776e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.022776e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.998292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.136612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.136612e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.735496 sec - 10,767,908,972 cycles # 2.879 GHz - 23,493,099,341 instructions # 2.18 insn per cycle - 3.741023923 seconds time elapsed +TOTAL : 3.603420 sec + 10,775,759,358 cycles # 2.987 GHz 
+ 23,493,324,442 instructions # 2.18 insn per cycle + 3.609029941 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.929407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.126423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.529668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.529668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.235559 sec - 6,081,264,505 cycles # 2.715 GHz - 12,002,246,039 instructions # 1.97 insn per cycle - 2.240973571 seconds time elapsed +TOTAL : 2.152135 sec + 6,059,373,548 cycles # 2.810 GHz + 12,002,088,580 instructions # 1.98 insn per cycle + 2.157738103 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.860705e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.225389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.225389e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.012802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.388358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.388358e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.264729 sec - 6,145,018,402 cycles # 2.708 GHz - 11,235,762,297 instructions # 1.83 insn per cycle - 2.270329967 seconds time elapsed +TOTAL : 2.197404 sec + 6,189,315,214 cycles # 2.811 GHz + 11,238,842,469 instructions # 1.82 insn per cycle + 2.202943349 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2110) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.696752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.901055e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.901055e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.024264e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.263071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.263071e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.944494 sec - 5,239,165,595 cycles # 1.777 GHz - 
9,095,766,728 instructions # 1.74 insn per cycle - 2.949694561 seconds time elapsed +TOTAL : 2.711042 sec + 5,186,071,903 cycles # 1.910 GHz + 9,092,984,657 instructions # 1.75 insn per cycle + 2.716697739 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1638) (512y: 208) (512z: 1583) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling index 70eb313ac9..4ce1d4e5cb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:41:21 +DATE: 2025-12-07_17:58:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.475062e+06 1 256 -3.218486e+06 2 256 -5.903821e+06 4 256 -1.165716e+07 8 256 -2.454885e+07 16 256 -4.527393e+07 32 256 -8.391766e+07 64 256 -1.334550e+08 128 256 -1.552485e+08 256 256 -1.694983e+08 512 256 -1.849571e+08 1024 256 +1.551007e+06 1 256 +3.183466e+06 2 256 +6.478921e+06 4 256 +1.126265e+07 8 256 +2.482063e+07 16 256 +4.808415e+07 32 256 +8.741350e+07 64 256 +1.324688e+08 128 256 +1.544939e+08 256 256 +1.712648e+08 512 256 +1.853915e+08 1024 256 ### GPU: scaling test 32 -1.882231e+05 1 32 -4.016921e+05 2 32 -8.022815e+05 4 32 -1.595811e+06 8 32 -3.056260e+06 16 32 -6.326142e+06 32 32 -1.208794e+07 64 32 -2.463478e+07 128 32 -4.741756e+07 256 32 -9.093281e+07 512 32 -1.150905e+08 1024 32 -1.344888e+08 2048 32 -1.543860e+08 4096 32 -1.683918e+08 8192 32 +1.853192e+05 1 32 +4.145454e+05 2 32 +7.406593e+05 4 32 +1.706826e+06 8 32 +3.221422e+06 16 32 +6.244626e+06 32 32 +1.278514e+07 64 32 +2.592192e+07 128 32 +4.923846e+07 256 32 +8.698745e+07 512 32 +1.193773e+08 1024 32 +1.333336e+08 2048 32 +1.559404e+08 4096 32 +1.682225e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.843216e+05 1 256 -1.897524e+05 2 256 -1.896027e+05 4 256 +1.815029e+05 1 256 +1.884920e+05 2 256 +1.897783e+05 4 256 ### CPU: scaling test 32 -1.666589e+05 1 32 -1.669510e+05 2 32 -1.791277e+05 4 32 +1.800312e+05 1 32 +1.564016e+05 2 32 +1.776926e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.321762e+05 1 256 -4.399797e+05 2 256 -4.577304e+05 4 256 +4.387168e+05 1 256 +4.267079e+05 2 256 +4.584246e+05 4 256 ### CPU: scaling test 32 -4.375351e+05 1 32 -3.779245e+05 2 32 -4.181545e+05 4 32 +4.365621e+05 1 32 +4.578526e+05 2 32 +4.358114e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.280541e+05 1 256 -9.070263e+05 2 256 -9.020254e+05 4 256 +9.255409e+05 1 256 +9.053310e+05 2 256 +9.165969e+05 4 256 ### CPU: scaling test 32 -8.873360e+05 1 32 -9.140769e+05 2 32 -9.224693e+05 4 32 +8.894077e+05 1 32 +8.278360e+05 2 32 +8.629757e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.444090e+05 1 256 -9.480587e+05 2 256 -9.506189e+05 4 256 +9.343987e+05 1 256 +9.659539e+05 2 256 +9.383691e+05 4 256 ### CPU: scaling test 32 -9.250159e+05 1 32 -9.436188e+05 2 32 -9.553023e+05 4 32 +9.208368e+05 1 32 +9.440224e+05 2 32 +8.195672e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.540106e+05 1 256 -6.620410e+05 2 256 -6.781399e+05 4 256 +6.745700e+05 1 256 +6.577773e+05 2 256 +6.506218e+05 4 256 ### CPU: scaling test 32 -5.655809e+05 1 32 -5.425522e+05 2 32 -6.546076e+05 4 32 +5.630334e+05 1 32 +6.220961e+05 2 32 +6.555329e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 29a4ea8877..ff84735ed7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:19:12 +DATE: 2025-12-07_17:36:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.227728e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.785385e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.924249e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.673906e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.802745e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927137e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492304 sec - 2,118,504,146 cycles # 2.819 GHz - 2,963,870,047 instructions # 1.40 insn per cycle - 0.808747497 seconds time elapsed +TOTAL : 0.484316 sec + 2,139,961,069 cycles # 2.896 GHz + 2,990,937,596 instructions # 1.40 insn per cycle + 0.795961544 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.880677e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.933319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.933319e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.998126e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.998126e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.662756 sec - 16,361,560,744 cycles # 2.887 GHz - 45,526,236,392 instructions # 2.78 insn per cycle - 5.668346367 seconds time elapsed +TOTAL : 5.477041 sec + 16,362,480,071 cycles # 2.986 GHz + 45,526,702,537 instructions # 2.78 insn per cycle + 5.482054037 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.414646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.739659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.739659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565308e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.903173e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.903173e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463879 sec - 7,092,934,877 cycles # 2.874 GHz - 17,852,493,922 instructions # 2.52 insn per cycle - 2.469325378 seconds time elapsed +TOTAL : 2.380915 sec + 7,097,395,142 cycles # 2.976 GHz + 
17,853,320,602 instructions # 2.52 insn per cycle + 2.385950404 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.208525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.313027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.313027e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.454860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.591471e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.591471e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.365011 sec - 3,747,283,623 cycles # 2.735 GHz - 8,291,354,119 instructions # 2.21 insn per cycle - 1.370608034 seconds time elapsed +TOTAL : 1.322984 sec + 3,740,449,961 cycles # 2.818 GHz + 8,292,007,767 instructions # 2.22 insn per cycle + 1.328007829 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.454543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.612605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.612605e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.606973e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.781586e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.781586e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.327433 sec - 3,648,803,599 cycles # 2.739 GHz - 8,020,246,707 instructions # 2.20 insn per cycle - 1.332943592 seconds time elapsed +TOTAL : 1.303244 sec + 3,643,191,529 cycles # 2.788 GHz + 8,020,070,672 instructions # 2.20 insn per cycle + 1.308085908 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.298741e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.918817e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.918817e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.586101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.259218e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.259218e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.753154 sec - 3,282,016,345 cycles # 1.867 GHz - 6,088,962,733 
instructions # 1.86 insn per cycle - 1.758605907 seconds time elapsed +TOTAL : 1.676219 sec + 3,282,637,278 cycles # 1.954 GHz + 6,089,770,731 instructions # 1.86 insn per cycle + 1.681057576 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling index d76cec9169..31f07a6d7d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:56:13 +DATE: 2025-12-07_18:12:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -4.541979e+05 1 256 -9.203949e+05 2 256 -1.645855e+06 4 256 -3.099419e+06 8 256 -4.823113e+06 16 256 -7.898172e+06 32 256 -1.061455e+07 64 256 -1.233940e+07 128 256 -1.359197e+07 256 256 -1.426011e+07 512 256 -1.471228e+07 1024 256 +5.108679e+05 1 256 +8.739543e+05 2 256 +1.717516e+06 4 256 +3.274573e+06 8 256 +5.409096e+06 16 256 +7.832092e+06 32 256 +1.058441e+07 64 256 +1.262663e+07 128 256 +1.366544e+07 256 256 +1.428151e+07 512 256 +1.475807e+07 1024 256 ### GPU: scaling test 32 -5.695876e+04 1 32 -1.092163e+05 2 32 -2.189134e+05 4 32 -4.543656e+05 8 32 -8.666538e+05 16 32 -1.664792e+06 32 32 -3.023066e+06 64 32 -5.156183e+06 128 32 -7.621691e+06 256 32 -1.049897e+07 512 32 -1.232012e+07 1024 32 -1.355710e+07 2048 32 -1.432425e+07 4096 32 -1.475276e+07 8192 32 +6.170316e+04 1 32 +1.237643e+05 2 32 +2.414416e+05 4 32 +4.953158e+05 8 32 +9.507945e+05 16 32 +1.759707e+06 32 32 +3.264385e+06 64 32 +5.140195e+06 128 32 +8.303819e+06 256 32 +1.050310e+07 512 32 +1.258383e+07 1024 32 +1.358787e+07 2048 32 +1.436323e+07 4096 32 +1.474434e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.747944e+05 1 256 -1.817829e+05 2 256 -1.896771e+05 4 256 +1.629071e+05 1 256 +1.892127e+05 2 256 +1.873947e+05 4 256 ### CPU: scaling test 32 -1.728805e+05 1 32 -1.767946e+05 2 32 -1.762418e+05 4 32 +1.798895e+05 1 32 +1.808584e+05 2 32 +1.778430e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.997246e+05 1 256 -4.307310e+05 2 256 -4.464263e+05 4 256 +4.396782e+05 1 256 +4.226239e+05 2 256 +4.598426e+05 4 256 ### CPU: scaling test 32 -3.999600e+05 1 32 -3.699679e+05 2 32 -4.315766e+05 4 32 +4.064783e+05 1 32 +4.279362e+05 2 32 +4.245552e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.797794e+05 1 256 -8.305580e+05 2 256 -8.419045e+05 4 256 +9.110937e+05 1 256 +8.592421e+05 2 256 +9.143257e+05 4 256 ### CPU: scaling test 32 -8.881488e+05 1 32 -9.130727e+05 2 32 -9.232345e+05 4 32 +6.929557e+05 1 32 +9.083297e+05 2 32 +8.517264e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.581879e+05 1 256 -9.512415e+05 2 256 -9.501003e+05 4 256 +9.364837e+05 1 256 +9.653329e+05 2 256 +8.851470e+05 4 256 ### CPU: scaling test 32 -9.220574e+05 1 32 -9.420354e+05 2 32 -8.881180e+05 4 32 +9.158820e+05 1 32 +9.425210e+05 2 32 +9.546611e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.495302e+05 1 256 -6.782481e+05 2 256 -6.868630e+05 4 256 +5.766013e+05 1 256 +6.691472e+05 2 256 +6.748773e+05 4 256 ### CPU: scaling test 32 -5.595188e+05 1 32 -6.234779e+05 2 32 -6.548319e+05 4 32 +5.583278e+05 1 32 +6.169626e+05 2 32 +6.493243e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt index e92eb3813b..bafc8147a0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:51:48 +DATE: 2025-12-07_18:08:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.351930e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.489593e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498993e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.371513e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.493740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501830e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 1.246737 sec - 4,579,068,239 cycles # 2.831 GHz - 6,336,239,576 instructions # 1.38 insn per cycle - 1.674994938 seconds time elapsed +TOTAL : 1.220251 sec + 4,662,362,888 cycles # 2.935 GHz + 6,467,361,446 instructions # 1.39 insn per cycle + 1.646519591 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.876691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.929278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.929278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.997749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997749e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.673971 sec - 16,357,814,340 cycles # 2.881 GHz - 45,526,139,472 instructions # 2.78 insn per cycle - 5.679332523 seconds time elapsed +TOTAL : 5.479023 sec + 16,363,338,754 cycles # 2.985 GHz + 45,526,012,441 instructions # 2.78 insn per cycle + 5.484015297 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.428670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.753669e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.753669e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.552253e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.890860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.890860e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.455440 sec - 7,090,910,684 cycles # 2.883 GHz - 17,852,546,600 instructions # 2.52 insn per cycle - 2.460806632 seconds time elapsed +TOTAL : 2.389125 sec + 7,097,832,602 cycles # 2.966 GHz + 
17,853,185,196 instructions # 2.52 insn per cycle + 2.394119860 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.063338e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.125894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.125894e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.417276e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.556671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.556671e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.386534 sec - 3,756,179,949 cycles # 2.700 GHz - 8,291,185,200 instructions # 2.21 insn per cycle - 1.391900760 seconds time elapsed +TOTAL : 1.329019 sec + 3,752,657,152 cycles # 2.815 GHz + 8,292,075,361 instructions # 2.21 insn per cycle + 1.334075636 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.396585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.545366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.545366e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.716101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.938008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.938008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.336868 sec - 3,642,317,678 cycles # 2.716 GHz - 8,019,205,916 instructions # 2.20 insn per cycle - 1.344058514 seconds time elapsed +TOTAL : 1.288014 sec + 3,652,488,576 cycles # 2.827 GHz + 8,019,739,053 instructions # 2.20 insn per cycle + 1.292878885 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.310834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.934764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.934764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.660384e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.336503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.336503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.748608 sec - 3,284,552,833 cycles # 1.874 GHz - 6,088,622,803 
instructions # 1.85 insn per cycle - 1.753990283 seconds time elapsed +TOTAL : 1.658781 sec + 3,286,101,339 cycles # 1.977 GHz + 6,089,303,471 instructions # 1.85 insn per cycle + 1.663650354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 3e1eb5adfb..8363282942 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:29:11 +DATE: 2025-12-07_18:48:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.961069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.550509e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.550509e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.876177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.300570e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.300570e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.685895 sec - 2,724,461,027 cycles # 2.849 GHz - 4,115,491,673 instructions # 1.51 insn per cycle - 1.013379386 seconds time elapsed +TOTAL : 0.684003 sec + 2,764,378,858 cycles # 2.904 GHz + 4,165,630,646 instructions # 1.51 insn per cycle + 1.009743594 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.932625e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.932625e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.929157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.983009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.983009e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.709270 sec - 16,545,315,698 cycles # 2.895 GHz - 45,565,469,143 instructions # 2.75 insn per cycle - 5.715931822 seconds time elapsed +TOTAL : 5.564914 sec + 16,556,726,858 cycles # 2.973 GHz + 45,565,031,394 instructions # 2.75 insn per cycle + 5.571477248 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.377287e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.696132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.696132e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.500331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.836584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.836584e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.532029 sec - 7,290,698,661 cycles # 2.873 GHz - 18,128,482,182 instructions # 2.49 insn per cycle - 2.538964767 seconds time elapsed +TOTAL : 2.462682 sec + 7,305,440,046 cycles # 2.960 GHz + 18,128,717,387 instructions # 2.48 insn per cycle + 2.469059928 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.010327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.072284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.072284e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.232294e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324543e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.324543e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.445098 sec - 3,968,422,684 cycles # 2.734 GHz - 8,524,408,845 instructions # 2.15 insn per cycle - 1.452187655 seconds time elapsed +TOTAL : 1.405458 sec + 3,956,946,679 cycles # 2.804 GHz + 8,524,862,161 instructions # 2.15 insn per cycle + 1.411874076 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.285117e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.425187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.425187e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.241282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.364460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.364460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.403001 sec - 3,860,651,396 cycles # 2.740 GHz - 8,252,993,133 instructions # 2.14 insn per cycle - 1.409829697 seconds time elapsed +TOTAL : 1.407611 sec + 3,848,321,931 cycles # 2.724 GHz + 8,254,066,046 instructions # 2.14 insn per cycle + 1.413960566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.256834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.869079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.869079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.440812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.077879e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.077879e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.813530 sec - 3,488,089,376 cycles # 1.917 GHz - 6,339,016,347 instructions # 1.82 insn per cycle - 1.820470769 seconds time elapsed +TOTAL : 1.763210 sec + 3,494,208,748 cycles # 1.976 GHz + 6,339,716,318 instructions # 1.81 insn per cycle + 1.769645097 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 001fd1b5e8..f2850b15f1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:44:30 +DATE: 2025-12-07_19:03:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.384623e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781787e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923075e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.318599e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799033e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932405e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.586690 sec - 2,388,718,169 cycles # 2.838 GHz - 3,423,003,931 instructions # 1.43 insn per cycle - 0.899326702 seconds time elapsed +TOTAL : 0.574956 sec + 2,417,856,443 cycles # 2.921 GHz + 3,455,951,021 instructions # 1.43 insn per cycle + 0.884968287 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.880714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934194e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.939652e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.994045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.994045e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.720004 sec - 16,536,660,388 cycles # 2.889 GHz - 45,556,960,525 instructions # 2.75 insn per cycle - 5.725324950 seconds time elapsed +TOTAL : 5.545408 sec + 16,554,388,934 cycles # 2.983 GHz + 45,555,350,772 instructions # 2.75 insn per cycle + 5.550490460 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.433465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.759989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.759989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.593108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.939572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.939572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.509292 sec - 7,256,957,374 cycles # 2.887 GHz - 17,864,987,256 instructions # 2.46 insn per cycle - 2.514536012 seconds time elapsed +TOTAL : 2.422572 sec + 7,271,757,726 cycles # 2.996 
GHz + 17,865,255,004 instructions # 2.46 insn per cycle + 2.427839332 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.020309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.092138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.092138e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.461505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.602929e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.602929e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.453461 sec - 3,918,315,703 cycles # 2.689 GHz - 8,275,994,533 instructions # 2.11 insn per cycle - 1.458689528 seconds time elapsed +TOTAL : 1.377916 sec + 3,914,536,895 cycles # 2.832 GHz + 8,275,793,493 instructions # 2.11 insn per cycle + 1.383072081 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.428992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.604343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.604343e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.731699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.958087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.958087e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.389726 sec - 3,813,398,977 cycles # 2.735 GHz - 7,970,393,641 instructions # 2.09 insn per cycle - 1.395086187 seconds time elapsed +TOTAL : 1.342772 sec + 3,819,937,290 cycles # 2.835 GHz + 7,971,132,655 instructions # 2.09 insn per cycle + 1.348030161 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.306240e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.928204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.928204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.561265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.231697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.231697e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.809723 sec - 3,457,472,821 cycles # 1.906 GHz - 6,039,803,289 
instructions # 1.75 insn per cycle - 1.815214301 seconds time elapsed +TOTAL : 1.739869 sec + 3,459,326,393 cycles # 1.984 GHz + 6,040,645,507 instructions # 1.75 insn per cycle + 1.744987586 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index d6dd5599d5..3ea651924b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:40:59 +DATE: 2025-12-07_19:00:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.173088e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784679e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.922376e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.250463e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799349e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932541e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.528664 sec - 2,228,192,580 cycles # 2.835 GHz - 3,376,529,061 instructions # 1.52 insn per cycle - 0.842332325 seconds time elapsed +TOTAL : 0.518144 sec + 2,245,751,348 cycles # 2.918 GHz + 3,458,311,159 instructions # 1.54 insn per cycle + 0.826284941 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.871432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.923569e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923569e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.939292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.993563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.993563e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.691172 sec - 16,369,213,744 cycles # 2.874 GHz - 45,526,750,504 instructions # 2.78 insn per cycle - 5.696402221 seconds time elapsed +TOTAL : 5.491090 sec + 16,370,308,470 cycles # 2.979 GHz + 45,525,854,880 instructions # 2.78 insn per cycle + 5.496248104 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.441693e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.769480e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.769480e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.536472e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.868379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.868379e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.448890 sec - 7,093,051,214 cycles # 2.891 GHz - 17,852,960,067 instructions # 2.52 insn per cycle - 2.454461827 seconds time elapsed +TOTAL : 2.397092 sec + 7,093,644,465 cycles # 2.954 
GHz + 17,852,580,006 instructions # 2.52 insn per cycle + 2.402295589 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.163467e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.249025e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.249025e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.425447e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.570423e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.570423e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.371747 sec - 3,753,987,891 cycles # 2.728 GHz - 8,291,362,993 instructions # 2.21 insn per cycle - 1.377043835 seconds time elapsed +TOTAL : 1.330058 sec + 3,765,061,736 cycles # 2.821 GHz + 8,291,686,290 instructions # 2.20 insn per cycle + 1.335239019 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.404785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.570601e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.570601e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.706825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.920072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.920072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.335938 sec - 3,649,997,495 cycles # 2.722 GHz - 8,019,382,433 instructions # 2.20 insn per cycle - 1.341456805 seconds time elapsed +TOTAL : 1.288803 sec + 3,655,112,623 cycles # 2.826 GHz + 8,019,410,691 instructions # 2.19 insn per cycle + 1.294118223 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.228574e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.840288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.840288e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.626100e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.290032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.290032e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.772330 sec - 3,277,054,131 cycles # 1.844 GHz - 6,089,082,639 
instructions # 1.86 insn per cycle - 1.777760056 seconds time elapsed +TOTAL : 1.666882 sec + 3,285,120,528 cycles # 1.966 GHz + 6,089,161,256 instructions # 1.85 insn per cycle + 1.672121415 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt index 0ad3efbc84..0d2b097c6b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:50:09 +DATE: 2025-12-07_19:16:23 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.507701e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.798145e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925897e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.288083e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.786376e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922912e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.495248 sec - 2,073,360,534 cycles # 2.817 GHz - 2,919,069,837 instructions # 1.41 insn per cycle - 0.794188547 seconds time elapsed +TOTAL : 0.491492 sec + 2,088,706,396 cycles # 2.906 GHz + 2,970,501,738 instructions # 1.42 insn per cycle + 0.776957474 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.871656e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.924156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.924156e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.000578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.000578e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.690466 sec - 16,392,687,892 cycles # 2.879 GHz - 45,529,529,055 instructions # 2.78 insn per cycle - 5.695668537 seconds time elapsed +TOTAL : 5.472267 sec + 16,368,482,528 cycles # 2.989 GHz + 45,527,963,066 instructions # 2.78 insn per cycle + 5.477730193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.439601e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.767131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.767131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.586876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.925262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.925262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.449797 sec - 7,091,941,326 cycles # 2.890 GHz - 17,852,858,856 instructions # 2.52 insn per cycle - 2.455296966 seconds time elapsed +TOTAL : 2.371148 sec + 7,098,149,225 cycles # 2.988 GHz + 
17,852,754,158 instructions # 2.52 insn per cycle + 2.376298200 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.145431e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.245108e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.245108e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.453616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.600648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.600648e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.374709 sec - 3,766,055,040 cycles # 2.731 GHz - 8,291,749,848 instructions # 2.20 insn per cycle - 1.380351643 seconds time elapsed +TOTAL : 1.325459 sec + 3,755,160,951 cycles # 2.824 GHz + 8,292,360,077 instructions # 2.21 insn per cycle + 1.330550163 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.422664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.588896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.699150e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.902050e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.902050e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.332190 sec - 3,646,916,248 cycles # 2.728 GHz - 8,019,155,847 instructions # 2.20 insn per cycle - 1.337783089 seconds time elapsed +TOTAL : 1.290152 sec + 3,646,653,034 cycles # 2.817 GHz + 8,020,506,405 instructions # 2.20 insn per cycle + 1.295241940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.310342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.933915e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.933915e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.613617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.283998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.283998e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.749833 sec - 3,289,282,662 cycles # 1.875 GHz - 6,089,226,401 
instructions # 1.85 insn per cycle - 1.755424623 seconds time elapsed +TOTAL : 1.670294 sec + 3,278,553,803 cycles # 1.958 GHz + 6,088,923,475 instructions # 1.86 insn per cycle + 1.675448212 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 0d4e6e9f4e..132a896294 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:37:35 +DATE: 2025-12-07_18:57:05 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.371325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.785294e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923320e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.665973e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.791556e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927198e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.635131 sec - 2,535,737,467 cycles # 2.824 GHz - 3,842,575,439 instructions # 1.52 insn per cycle - 0.954476643 seconds time elapsed +TOTAL : 0.624003 sec + 2,559,928,139 cycles # 2.920 GHz + 3,900,795,823 instructions # 1.52 insn per cycle + 0.933869327 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.876671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930263e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.985439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.985439e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.674874 sec - 16,371,341,972 cycles # 2.883 GHz - 45,526,097,275 instructions # 2.78 insn per cycle - 5.680145436 seconds time elapsed +TOTAL : 5.512311 sec + 16,371,008,689 cycles # 2.968 GHz + 45,526,873,330 instructions # 2.78 insn per cycle + 5.517468382 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.409852e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.733764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.733764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.590738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.932750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.932750e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.465466 sec - 7,089,429,077 cycles # 2.870 GHz - 17,852,779,482 instructions # 2.52 insn per cycle - 2.470998970 seconds time elapsed +TOTAL : 2.368386 sec + 7,096,620,103 cycles # 2.991 GHz + 17,852,891,458 instructions # 2.52 insn per cycle + 2.373525243 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.159709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.263116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.263116e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.451642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.578982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.578982e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.372303 sec - 3,755,689,027 cycles # 2.728 GHz - 8,291,380,091 instructions # 2.21 insn per cycle - 1.377787541 seconds time elapsed +TOTAL : 1.324443 sec + 3,752,326,366 cycles # 2.825 GHz + 8,291,613,980 instructions # 2.21 insn per cycle + 1.329630243 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.407094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.566877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.566877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.661619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.864713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.864713e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.334826 sec - 3,652,466,006 cycles # 2.727 GHz - 8,020,599,017 instructions # 2.20 insn per cycle - 1.340268045 seconds time elapsed +TOTAL : 1.296028 sec + 3,649,027,143 cycles # 2.806 GHz + 8,019,590,845 instructions # 2.20 insn per cycle + 1.301017414 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.261859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.880005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.880005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.571433e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.241356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.241356e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.763075 sec - 3,282,506,046 cycles # 1.857 GHz - 6,088,973,421 instructions # 1.85 insn per cycle - 1.768455658 seconds time elapsed +TOTAL : 1.680703 sec + 3,284,933,439 cycles # 1.950 GHz + 6,089,236,893 instructions # 1.85 insn per cycle + 1.685900188 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index e0e7f701d0..46918d2e37 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:19:36 +DATE: 2025-12-07_17:37:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.162146e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.783523e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914919e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.721813e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814545e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.938524e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.491426 sec - 2,125,746,364 cycles # 2.830 GHz - 2,979,109,571 instructions # 1.40 insn per cycle - 0.808584273 seconds time elapsed +TOTAL : 0.485209 sec + 2,140,401,482 cycles # 2.904 GHz + 3,007,684,015 instructions # 1.41 insn per cycle + 0.795460461 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921360e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.959115e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.015378e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015378e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.544826 sec - 16,047,528,517 cycles # 2.892 GHz - 44,602,173,132 instructions # 2.78 insn per cycle - 5.550245916 seconds time elapsed +TOTAL : 5.436721 sec + 16,060,069,248 cycles # 2.952 GHz + 44,603,416,333 instructions # 2.78 insn per cycle + 5.441623021 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.214945e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.668104e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.668104e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.331152e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.795717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.795717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.098377 sec - 6,110,919,161 cycles # 2.906 GHz - 17,150,206,958 instructions # 2.81 insn per cycle - 2.103751937 seconds time elapsed +TOTAL : 2.050604 sec + 6,105,156,577 cycles # 2.971 GHz + 
17,150,592,299 instructions # 2.81 insn per cycle + 2.055489400 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2861) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.851382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.388872e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.149583e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.736178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.736178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.879565 sec - 5,032,467,533 cycles # 2.672 GHz - 10,256,120,490 instructions # 2.04 insn per cycle - 1.885016732 seconds time elapsed +TOTAL : 1.789880 sec + 5,038,618,409 cycles # 2.809 GHz + 10,256,206,067 instructions # 2.04 insn per cycle + 1.794849567 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3911) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.035975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.607599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.607599e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.198016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.795541e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.795541e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.824491 sec - 4,977,961,454 cycles # 2.721 GHz - 10,027,255,295 instructions # 2.01 insn per cycle - 1.830117525 seconds time elapsed +TOTAL : 1.775359 sec + 4,974,875,613 cycles # 2.796 GHz + 10,026,959,347 instructions # 2.02 insn per cycle + 1.780381395 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3808) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.496582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.807885e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.807885e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.779828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.118986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.118986e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.420813 sec - 4,388,139,749 cycles # 1.809 GHz - 8,457,918,888 
instructions # 1.93 insn per cycle - 2.426523884 seconds time elapsed +TOTAL : 2.277074 sec + 4,378,799,967 cycles # 1.920 GHz + 8,457,746,684 instructions # 1.93 insn per cycle + 2.281985703 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index f0b80e260e..b9198cf358 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:19:19 +DATE: 2025-12-07_18:39:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.131628e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.790004e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927316e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.962103e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767357e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905981e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492105 sec - 2,126,004,887 cycles # 2.830 GHz - 2,972,871,951 instructions # 1.40 insn per cycle - 0.808125336 seconds time elapsed +TOTAL : 0.491294 sec + 2,172,205,978 cycles # 2.909 GHz + 3,025,756,443 instructions # 1.39 insn per cycle + 0.805730150 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.361435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.444812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.444812e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.495975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.587318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.587318e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.526570 sec - 12,786,889,749 cycles # 2.822 GHz - 34,767,168,341 instructions # 2.72 insn per cycle - 4.531843724 seconds time elapsed +TOTAL : 4.285294 sec + 12,807,429,644 cycles # 2.986 GHz + 34,767,392,719 instructions # 2.71 insn per cycle + 4.290562514 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 649) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.142214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.212790e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.670625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.126971 sec - 6,176,687,935 cycles # 2.898 GHz - 14,909,588,070 instructions # 2.41 insn per cycle - 2.132251600 seconds time elapsed +TOTAL : 2.098171 sec + 6,190,385,812 cycles # 2.945 GHz + 
14,909,929,225 instructions # 2.41 insn per cycle + 2.104539075 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2978) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.053580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.852260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.852260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.314499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.155824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.155824e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.573119 sec - 4,286,494,919 cycles # 2.717 GHz - 9,134,727,561 instructions # 2.13 insn per cycle - 1.578532938 seconds time elapsed +TOTAL : 1.518870 sec + 4,293,709,263 cycles # 2.819 GHz + 9,134,695,364 instructions # 2.13 insn per cycle + 1.524396388 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4466) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.155196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.974374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.974374e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.383476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.242059e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.242059e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.552673 sec - 4,257,884,690 cycles # 2.734 GHz - 8,700,271,049 instructions # 2.04 insn per cycle - 1.558196136 seconds time elapsed +TOTAL : 1.505297 sec + 4,248,023,279 cycles # 2.813 GHz + 8,701,309,483 instructions # 2.05 insn per cycle + 1.510699557 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4224) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.246960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.671205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.671205e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.537188e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.996292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.996292e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.085797 sec - 3,847,204,769 cycles # 1.841 GHz - 7,838,410,301 
instructions # 2.04 insn per cycle - 2.091150296 seconds time elapsed +TOTAL : 1.978326 sec + 3,854,791,882 cycles # 1.944 GHz + 7,838,960,867 instructions # 2.03 insn per cycle + 1.983833070 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4276) (512y: 0) (512z: 2561) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 26b7d791d0..5fa97f6eb1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:19:42 +DATE: 2025-12-07_18:40:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.156027e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.795194e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935274e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.932204e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923385e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.491299 sec - 2,134,224,720 cycles # 2.818 GHz - 2,993,931,932 instructions # 1.40 insn per cycle - 0.814346515 seconds time elapsed +TOTAL : 0.488937 sec + 2,197,608,974 cycles # 2.913 GHz + 3,032,081,434 instructions # 1.38 insn per cycle + 0.811485575 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.565640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.664688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.664688e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.682004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.786723e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786723e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.173683 sec - 11,879,331,181 cycles # 2.844 GHz - 35,236,712,439 instructions # 2.97 insn per cycle - 4.178908664 seconds time elapsed +TOTAL : 3.995080 sec + 11,889,678,207 cycles # 2.973 GHz + 35,234,738,815 instructions # 2.96 insn per cycle + 4.000645870 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.266171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.744141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.744141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.340775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.827598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.827598e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.079083 sec - 5,991,903,430 cycles # 2.877 GHz - 14,602,254,330 instructions # 2.44 insn per cycle - 2.084327795 seconds time elapsed +TOTAL : 2.050904 sec + 6,008,090,573 cycles # 2.923 GHz + 
14,602,481,771 instructions # 2.43 insn per cycle + 2.056440048 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.207154e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.042682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.042682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.457760e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.339708e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.339708e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.541810 sec - 4,186,740,965 cycles # 2.708 GHz - 8,926,188,902 instructions # 2.13 insn per cycle - 1.547085242 seconds time elapsed +TOTAL : 1.489994 sec + 4,205,471,794 cycles # 2.813 GHz + 8,926,588,092 instructions # 2.12 insn per cycle + 1.495568427 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3572) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.102028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.913223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.913223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.529977e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.427868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.427868e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.563681 sec - 4,235,267,452 cycles # 2.701 GHz - 8,456,560,522 instructions # 2.00 insn per cycle - 1.569074089 seconds time elapsed +TOTAL : 1.477611 sec + 4,149,298,363 cycles # 2.800 GHz + 8,457,689,045 instructions # 2.04 insn per cycle + 1.482990150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.304407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.741587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.741587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.562127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.035184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.035184e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.064360 sec - 3,788,747,014 cycles # 1.832 GHz - 7,722,840,376 
instructions # 2.04 insn per cycle - 2.069669389 seconds time elapsed +TOTAL : 1.971720 sec + 3,794,091,581 cycles # 1.920 GHz + 7,720,026,076 instructions # 2.03 insn per cycle + 1.977220825 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3288) (512y: 0) (512z: 2115) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling index 54ccd09765..b2644c85ef 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:41:00 +DATE: 2025-12-07_17:57:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.555626e+06 1 256 -2.986119e+06 2 256 -6.036846e+06 4 256 -1.188714e+07 8 256 -2.177797e+07 16 256 -4.206332e+07 32 256 -5.661642e+07 64 256 -6.199098e+07 128 256 -6.763415e+07 256 256 -7.331358e+07 512 256 -7.450922e+07 1024 256 +1.488450e+06 1 256 +2.910298e+06 2 256 +5.976421e+06 4 256 +1.208638e+07 8 256 +2.607456e+07 16 256 +4.483534e+07 32 256 +5.671638e+07 64 256 +6.292535e+07 128 256 +6.848618e+07 256 256 +7.357181e+07 512 256 +7.439299e+07 1024 256 ### GPU: scaling test 32 -1.688262e+05 1 32 -3.674276e+05 2 32 -6.877986e+05 4 32 -1.577034e+06 8 32 -2.900718e+06 16 32 -6.084626e+06 32 32 -1.103805e+07 64 32 -2.304347e+07 128 32 -4.366714e+07 256 32 -5.801104e+07 512 32 -6.280270e+07 1024 32 -6.781899e+07 2048 32 -7.247457e+07 4096 32 -7.443838e+07 8192 32 +1.948606e+05 1 32 +3.932847e+05 2 32 +8.369678e+05 4 32 +1.587075e+06 8 32 +3.304868e+06 16 32 +6.243446e+06 32 32 +1.191959e+07 64 32 +2.411440e+07 128 32 +4.198442e+07 256 32 +5.445233e+07 512 32 +6.448629e+07 1024 32 +6.809438e+07 2048 32 +7.257321e+07 4096 32 +7.510211e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.683557e+05 1 256 -1.766666e+05 2 256 -1.772916e+05 4 256 +1.745862e+05 1 256 +1.773466e+05 2 256 +1.746947e+05 4 256 ### CPU: scaling test 32 -1.624761e+05 1 32 -1.667961e+05 2 32 -1.691810e+05 4 32 +1.562614e+05 1 32 +1.723223e+05 2 32 +1.661356e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.045208e+05 1 256 -3.168070e+05 2 256 -3.217376e+05 4 256 +2.920055e+05 1 256 +3.092462e+05 2 256 +3.248076e+05 4 256 ### CPU: scaling test 32 -2.400438e+05 1 32 -2.988113e+05 2 32 -3.019623e+05 4 32 +2.913382e+05 1 32 +2.974019e+05 2 32 +3.044307e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.679979e+05 1 256 -5.383388e+05 2 256 -5.290511e+05 4 256 +5.340220e+05 1 256 +5.347219e+05 2 256 +5.181948e+05 4 256 ### CPU: scaling test 32 -4.501210e+05 1 32 -5.408786e+05 2 32 -5.212787e+05 4 32 +5.299856e+05 1 32 +5.407232e+05 2 32 +5.180928e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.337937e+05 1 256 -5.659660e+05 2 256 -5.616905e+05 4 256 +5.751685e+05 1 256 +5.681617e+05 2 256 +5.656303e+05 4 256 ### CPU: scaling test 32 -5.554591e+05 1 32 -5.687726e+05 2 32 -5.722998e+05 4 32 +5.576370e+05 1 32 +5.668834e+05 2 32 +5.531762e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.669688e+05 1 256 -3.628236e+05 2 256 -3.574239e+05 4 256 +3.579268e+05 1 256 +3.491216e+05 2 256 +3.633512e+05 4 256 ### CPU: scaling test 32 -3.591712e+05 1 32 -3.436223e+05 2 32 -3.302689e+05 4 32 +3.591309e+05 1 32 +3.448778e+05 2 32 +3.690761e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 544d45db6c..773ce7d493 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:18:10 +DATE: 2025-12-07_17:35:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.769964e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.181272e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.572183e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.911733e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.197101e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.578585e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539441 sec - 2,308,666,493 cycles # 2.818 GHz - 3,226,425,933 instructions # 1.40 insn per cycle - 0.876647709 seconds time elapsed +TOTAL : 0.531309 sec + 2,298,263,727 cycles # 2.894 GHz + 3,237,712,578 instructions # 1.41 insn per cycle + 0.851772712 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.759806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.067261 sec - 17,454,635,732 cycles # 2.875 GHz - 46,423,626,762 instructions # 2.66 insn per cycle - 6.073054725 seconds time elapsed +TOTAL : 5.898002 sec + 17,492,263,536 cycles # 2.964 GHz + 46,406,102,353 instructions # 2.65 insn per cycle + 5.903090860 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ 
-116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.147663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.305031e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.305031e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.249470e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410418e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.441893 sec - 9,972,963,833 cycles # 2.894 GHz - 27,538,315,448 instructions # 2.76 insn per cycle - 3.447650533 seconds time elapsed +TOTAL : 3.331431 sec + 9,987,092,564 cycles # 2.994 GHz + 27,526,459,356 instructions # 2.76 insn per cycle + 3.336604437 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.024399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.421447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.421447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.183035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.593016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593016e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.195598 sec - 6,002,435,023 cycles # 2.728 GHz - 12,431,827,184 instructions # 2.07 insn per cycle - 2.201348309 seconds time elapsed +TOTAL : 2.128343 sec + 5,986,967,079 cycles # 2.808 GHz + 12,419,962,919 instructions # 2.07 insn per cycle + 2.133416645 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 
2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.239682e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.660399e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.660399e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.424137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.862976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.862976e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.110434 sec - 5,712,484,983 cycles # 2.700 GHz - 11,998,977,462 instructions # 2.10 insn per cycle - 2.116158863 seconds time elapsed +TOTAL : 2.037797 sec + 5,730,267,891 cycles # 2.806 GHz + 11,987,111,933 instructions # 2.09 insn per cycle + 2.042972381 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.500878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.684605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.684605e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.629539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.821036e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.104242 sec - 5,600,150,554 cycles # 1.801 GHz - 7,978,262,251 instructions # 1.42 insn per cycle - 3.109987032 seconds time elapsed +TOTAL : 2.993460 sec + 5,594,985,121 cycles # 1.866 GHz + 
7,969,205,749 instructions # 1.42 insn per cycle + 2.998509816 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling index 108784d281..ddcfe3f23b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:55:32 +DATE: 2025-12-07_18:12:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.842927e+05 1 256 -7.220512e+05 2 256 -1.491222e+06 4 256 -2.667848e+06 8 256 -4.492588e+06 16 256 -7.139826e+06 32 256 -9.157999e+06 64 256 -1.073484e+07 128 256 -1.179428e+07 256 256 -1.249669e+07 512 256 -1.288538e+07 1024 256 +3.994651e+05 1 256 +7.985551e+05 2 256 +1.503243e+06 4 256 +2.794013e+06 8 256 +4.595158e+06 16 256 +7.329927e+06 32 256 +9.535755e+06 64 256 +1.089703e+07 128 256 +1.187214e+07 256 256 +1.251805e+07 512 256 +1.286430e+07 1024 256 ### GPU: scaling test 32 -4.771078e+04 1 32 -9.904224e+04 2 32 -1.834573e+05 4 32 -3.665684e+05 8 32 -7.223823e+05 16 32 -1.469468e+06 32 32 -2.777699e+06 64 32 -4.610551e+06 128 32 -7.035262e+06 256 32 -9.216118e+06 512 32 -1.072571e+07 1024 32 -1.171381e+07 2048 32 -1.244431e+07 4096 32 -1.273882e+07 8192 32 +5.064333e+04 1 32 +1.023224e+05 2 32 +2.045232e+05 4 32 +4.191719e+05 8 32 +7.921650e+05 16 32 +1.584828e+06 32 32 +2.806952e+06 64 32 +4.738788e+06 128 32 +7.295032e+06 256 32 +9.453051e+06 512 32 +1.073132e+07 1024 32 +1.182017e+07 2048 32 +1.244007e+07 4096 32 +1.279169e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.731213e+05 1 256 -1.728516e+05 2 256 -1.721045e+05 4 256 +1.681080e+05 1 256 +1.741496e+05 2 256 +1.757422e+05 4 256 ### CPU: scaling test 32 -1.615729e+05 1 32 -1.697199e+05 2 32 -1.614079e+05 4 32 +1.495460e+05 1 32 +1.519544e+05 2 32 +1.661979e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.020824e+05 1 256 -3.069129e+05 2 256 -3.229135e+05 4 256 +2.914055e+05 1 256 +2.902397e+05 2 256 +3.149646e+05 4 256 ### CPU: scaling test 32 -3.068132e+05 1 32 -3.048781e+05 2 32 -3.056454e+05 4 32 +2.842423e+05 1 32 +2.699909e+05 2 32 +2.863010e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.343999e+05 1 256 -5.367208e+05 2 256 -5.297172e+05 4 256 +4.910320e+05 1 256 +4.984414e+05 2 256 +4.996370e+05 4 256 ### CPU: scaling test 32 -5.308120e+05 1 32 -5.388158e+05 2 32 -5.419802e+05 4 32 +5.310410e+05 1 32 +5.392516e+05 2 32 +5.235196e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.825073e+05 1 256 -5.664394e+05 2 256 -5.715909e+05 4 256 +5.139386e+05 1 256 +5.418827e+05 2 256 +5.246359e+05 4 256 ### CPU: scaling test 32 -5.596656e+05 1 32 -5.686160e+05 2 32 -5.559851e+05 4 32 +5.575204e+05 1 32 +5.667378e+05 2 32 +5.304754e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.589260e+05 1 256 -3.525435e+05 2 256 -3.573650e+05 4 256 +3.659077e+05 1 256 +3.601791e+05 2 256 +3.614495e+05 4 256 ### CPU: scaling test 32 -3.610027e+05 1 32 -3.443008e+05 2 32 -3.569646e+05 4 32 +3.221682e+05 1 32 +3.610332e+05 2 32 +3.522910e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt index 7312e696ce..a717de721d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:51:10 +DATE: 2025-12-07_18:07:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.104417e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.285432e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297689e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.114953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.282438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293394e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.279377 sec - 4,758,540,406 cycles # 2.854 GHz - 6,643,646,071 instructions # 1.40 insn per cycle - 1.727175074 seconds time elapsed +TOTAL : 1.255727 sec + 4,783,213,337 cycles # 2.931 GHz + 6,701,232,304 instructions # 1.40 insn per cycle + 1.690871905 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.760176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.807876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852801e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.064955 sec - 17,456,010,031 cycles # 2.876 GHz - 46,423,917,890 instructions # 2.66 insn per cycle - 6.070556221 seconds time elapsed +TOTAL : 5.906048 sec + 17,497,853,051 cycles # 2.961 GHz + 46,405,528,530 instructions # 2.65 insn per cycle + 5.911189624 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ 
-116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.112364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.267713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.267713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.192409e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348639e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348639e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.477891 sec - 9,968,942,008 cycles # 2.863 GHz - 27,538,128,939 instructions # 2.76 insn per cycle - 3.483544020 seconds time elapsed +TOTAL : 3.390421 sec + 9,982,101,191 cycles # 2.941 GHz + 27,526,487,871 instructions # 2.76 insn per cycle + 3.395628547 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.028981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.424760e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.424760e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.157257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.560566e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.560566e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.192400 sec - 5,973,164,521 cycles # 2.719 GHz - 12,431,134,039 instructions # 2.08 insn per cycle - 2.197968192 seconds time elapsed +TOTAL : 2.137550 sec + 5,981,581,971 cycles # 2.793 GHz + 12,419,758,664 instructions # 2.08 insn per cycle + 2.142689284 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 
2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.257840e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.686842e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.686842e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.424839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.863995e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.863995e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.101990 sec - 5,696,565,349 cycles # 2.704 GHz - 11,998,610,945 instructions # 2.11 insn per cycle - 2.107441314 seconds time elapsed +TOTAL : 2.037455 sec + 5,697,044,338 cycles # 2.790 GHz + 11,984,752,841 instructions # 2.10 insn per cycle + 2.042543144 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.469903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.652910e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.652910e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.710127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.910746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.910746e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.130516 sec - 5,582,204,405 cycles # 1.781 GHz - 7,977,597,583 instructions # 1.43 insn per cycle - 3.135909354 seconds time elapsed +TOTAL : 2.931801 sec + 5,579,762,643 cycles # 1.901 GHz + 
7,968,608,858 instructions # 1.43 insn per cycle + 2.936965900 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt index a27304f7a2..b5a7ff61d7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:49:40 +DATE: 2025-12-07_19:15:51 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.756606e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.155088e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.561577e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739849e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.159759e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.564900e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.537651 sec - 2,186,941,067 cycles # 2.809 GHz - 3,125,534,216 instructions # 1.43 insn per cycle - 0.834390897 seconds time elapsed +TOTAL : 0.535139 sec + 2,180,547,931 cycles # 2.822 GHz + 3,119,980,193 instructions # 1.43 insn per cycle + 0.829946527 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.767944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.812249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.823561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.869227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.869227e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.039437 sec - 17,472,986,286 cycles # 2.891 GHz - 46,424,951,460 instructions # 2.66 insn per cycle - 6.045113130 seconds time elapsed +TOTAL : 5.855629 sec + 17,490,738,032 cycles # 2.985 GHz + 46,404,552,509 instructions # 2.65 insn per cycle + 5.860912027 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ 
-116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.115406e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.269058e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.269058e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.216476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.376391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.376391e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.475319 sec - 9,963,493,199 cycles # 2.863 GHz - 27,538,476,105 instructions # 2.76 insn per cycle - 3.481071152 seconds time elapsed +TOTAL : 3.365600 sec + 9,984,795,479 cycles # 2.963 GHz + 27,525,821,384 instructions # 2.76 insn per cycle + 3.370882147 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.946610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.336487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.336487e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.136370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.548164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.548164e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.229478 sec - 5,990,602,521 cycles # 2.681 GHz - 12,432,421,413 instructions # 2.08 insn per cycle - 2.235415428 seconds time elapsed +TOTAL : 2.148219 sec + 5,987,246,469 cycles # 2.781 GHz + 12,419,794,182 instructions # 2.07 insn per cycle + 2.153459784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 
2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.285571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.719782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.719782e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.307210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.742007e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.742007e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.092266 sec - 5,708,527,225 cycles # 2.722 GHz - 11,999,256,931 instructions # 2.10 insn per cycle - 2.098089382 seconds time elapsed +TOTAL : 2.081707 sec + 5,726,046,295 cycles # 2.745 GHz + 11,987,951,189 instructions # 2.09 insn per cycle + 2.087180766 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.527493e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713588e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713588e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706776e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.906239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.906239e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.081621 sec - 5,593,729,597 cycles # 1.813 GHz - 7,978,349,260 instructions # 1.43 insn per cycle - 3.087480023 seconds time elapsed +TOTAL : 2.933414 sec + 5,600,024,888 cycles # 1.906 GHz + 
7,968,336,026 instructions # 1.42 insn per cycle + 2.938694844 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 1465355626..b6783e979d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:18:40 +DATE: 2025-12-07_17:36:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.777084e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.077254e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446466e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.875815e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.093264e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456173e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.540754 sec - 2,303,579,994 cycles # 2.845 GHz - 3,194,596,199 instructions # 1.39 insn per cycle - 0.867263238 seconds time elapsed +TOTAL : 0.531400 sec + 2,263,146,986 cycles # 2.849 GHz + 3,218,371,825 instructions # 1.42 insn per cycle + 0.851378195 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.824688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.863316e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.910834e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.910834e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855357 sec - 17,037,217,478 cycles # 2.907 GHz - 45,397,533,623 instructions # 2.66 insn per cycle - 5.861206077 seconds time elapsed +TOTAL : 5.731496 sec + 17,081,156,993 cycles # 2.978 GHz + 45,381,356,144 instructions # 2.66 insn per cycle + 5.736596917 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ 
-116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.237044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404010e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.312005e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.479348e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.479348e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.349468 sec - 9,646,439,674 cycles # 2.877 GHz - 26,137,505,372 instructions # 2.71 insn per cycle - 3.359990731 seconds time elapsed +TOTAL : 3.271075 sec + 9,656,313,890 cycles # 2.949 GHz + 26,124,529,827 instructions # 2.71 insn per cycle + 3.276134101 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2348) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.466137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.774981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.774981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.616867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.937524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.937524e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.456437 sec - 6,697,050,662 cycles # 2.721 GHz - 13,944,204,689 instructions # 2.08 insn per cycle - 2.462051029 seconds time elapsed +TOTAL : 2.375344 sec + 6,689,119,996 cycles # 2.811 GHz + 13,931,643,085 instructions # 2.08 insn per cycle + 2.380451278 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 
2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.691262e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.027361e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.027361e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.845463e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.195359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.195359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.343988 sec - 6,390,605,834 cycles # 2.721 GHz - 13,479,985,492 instructions # 2.11 insn per cycle - 2.349738024 seconds time elapsed +TOTAL : 2.268504 sec + 6,402,474,381 cycles # 2.818 GHz + 13,468,504,372 instructions # 2.10 insn per cycle + 2.273684492 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.551855e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.739422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.739422e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.735584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.936192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.936192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.060308 sec - 5,571,902,780 cycles # 1.818 GHz - 9,121,747,396 instructions # 1.64 insn per cycle - 3.066113600 seconds time elapsed +TOTAL : 2.911210 sec + 5,559,382,718 cycles # 1.907 GHz + 
9,112,349,648 instructions # 1.64 insn per cycle + 2.916297777 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2028) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling index 13f478253e..44633d6e41 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:41:41 +DATE: 2025-12-07_17:58:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -9.342009e+05 1 256 -1.901727e+06 2 256 -3.513575e+06 4 256 -6.551587e+06 8 256 -9.027157e+06 16 256 -1.070472e+07 32 256 -1.211534e+07 64 256 -1.306873e+07 128 256 -1.345611e+07 256 256 -1.354148e+07 512 256 -1.365009e+07 1024 256 +8.975150e+05 1 256 +1.997979e+06 2 256 +3.665967e+06 4 256 +6.812859e+06 8 256 +9.175689e+06 16 256 +1.081020e+07 32 256 +1.214817e+07 64 256 +1.288360e+07 128 256 +1.321911e+07 256 256 +1.355652e+07 512 256 +1.369836e+07 1024 256 ### GPU: scaling test 32 -1.205755e+05 1 32 -2.514606e+05 2 32 -5.001172e+05 4 32 -9.511001e+05 8 32 -1.851142e+06 16 32 -3.545547e+06 32 32 -6.694933e+06 64 32 -9.515800e+06 128 32 -1.033055e+07 256 32 -1.109138e+07 512 32 -1.156765e+07 1024 32 -1.192504e+07 2048 32 -1.207986e+07 4096 32 -1.213861e+07 8192 32 +1.218078e+05 1 32 +2.514221e+05 2 32 +4.930093e+05 4 32 +9.653711e+05 8 32 +1.881087e+06 16 32 +3.741446e+06 32 32 +6.446475e+06 64 32 +9.550212e+06 128 32 +1.039591e+07 256 32 +1.110598e+07 512 32 +1.169117e+07 1024 32 +1.191687e+07 2048 32 +1.206845e+07 4096 32 +1.216543e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.335000e+04 1 256 -2.360867e+04 2 256 -2.368335e+04 4 256 +2.310789e+04 1 256 +2.341569e+04 2 256 +2.377355e+04 4 256 ### CPU: scaling test 32 -2.236539e+04 1 32 -2.311725e+04 2 32 -2.306838e+04 4 32 +2.235361e+04 1 32 +2.254575e+04 2 32 +2.159371e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.370978e+04 1 256 -4.405634e+04 2 256 -4.456211e+04 4 256 +4.401740e+04 1 256 +4.340482e+04 2 256 +4.458195e+04 4 256 ### CPU: scaling test 32 -3.836659e+04 1 32 -4.179709e+04 2 32 -4.369754e+04 4 32 +4.065655e+04 1 32 +4.309477e+04 2 32 +4.325271e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.926025e+04 1 256 -8.558488e+04 2 256 -8.539748e+04 4 256 +8.814435e+04 1 256 +8.149366e+04 2 256 +9.032743e+04 4 256 ### CPU: scaling test 32 -8.398708e+04 1 32 -8.906950e+04 2 32 -8.745810e+04 4 32 +8.869033e+04 1 32 +8.610414e+04 2 32 +8.548579e+04 4 32 ========================================================================= 
scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.556008e+04 1 256 -9.646045e+04 2 256 -9.528700e+04 4 256 +9.629545e+04 1 256 +9.649306e+04 2 256 +9.664290e+04 4 256 ### CPU: scaling test 32 -8.322886e+04 1 32 -8.916295e+04 2 32 -9.000274e+04 4 32 +9.342793e+04 1 32 +9.628593e+04 2 32 +9.663608e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.425669e+04 1 256 -6.732158e+04 2 256 -6.696446e+04 4 256 +6.754511e+04 1 256 +6.766985e+04 2 256 +6.798669e+04 4 256 ### CPU: scaling test 32 -6.780265e+04 1 32 -6.786649e+04 2 32 -6.753983e+04 4 32 +6.875127e+04 1 32 +6.690998e+04 2 32 +6.542365e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 53423221d6..080a6c0715 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:20:08 +DATE: 2025-12-07_17:37:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.590985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195514e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.215933e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.005749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211141e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.228085e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475543 sec - 2,072,965,387 cycles # 2.836 GHz - 2,812,513,904 instructions # 1.36 insn per cycle - 0.789686961 seconds time elapsed +TOTAL : 0.465449 sec + 2,041,375,001 cycles # 2.875 GHz + 2,814,351,977 instructions # 1.38 insn per cycle + 0.766690035 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.134307e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.362144e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.374708e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.143121e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.368948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.381954e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.566501 sec - 2,402,738,046 cycles # 2.849 GHz - 3,415,144,104 instructions # 1.42 insn per cycle - 0.902303425 seconds time elapsed +TOTAL : 0.556024 sec + 2,417,834,721 cycles # 2.914 GHz + 3,478,069,144 instructions # 1.44 insn per cycle + 0.889937049 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.360536e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372172e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372172e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397510e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.408971e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408971e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.962552 sec - 20,052,897,229 cycles # 2.879 GHz - 60,517,484,268 instructions # 3.02 insn per cycle - 6.966626285 seconds time elapsed +TOTAL : 6.854308 sec + 20,326,219,770 cycles # 2.964 GHz + 60,622,631,710 instructions # 2.98 insn per cycle + 6.858205604 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.457200e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.498681e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.498681e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.602607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.644172e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.644172e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.696167 sec - 10,707,329,548 cycles # 2.895 GHz - 31,170,881,652 instructions # 2.91 insn per cycle - 3.700212507 seconds time elapsed +TOTAL : 3.578959 sec + 10,703,050,389 cycles # 2.988 GHz + 31,170,899,966 instructions # 2.91 insn per cycle + 3.582863003 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.870920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.029877e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.029877e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.161029e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.324341e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.867542 sec - 5,077,134,246 cycles # 2.714 GHz - 11,510,163,524 instructions # 2.27 insn per cycle - 1.871736808 seconds time elapsed +TOTAL : 1.807642 sec + 5,064,337,829 cycles # 2.797 GHz + 11,511,166,873 instructions # 2.27 insn per cycle + 1.811664141 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.650179e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.846221e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.846221e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.951094e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.014031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.014031e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.718355 sec - 4,666,627,650 cycles # 2.711 GHz - 10,813,430,115 instructions # 2.32 insn per cycle - 1.722417533 seconds time elapsed +TOTAL : 1.665546 sec + 4,657,479,591 cycles # 2.791 GHz + 10,813,305,377 instructions # 2.32 insn per cycle + 1.669408982 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.895380e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.991775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.991775e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.219925e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.319679e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.319679e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.398459 sec - 4,202,110,606 cycles # 1.750 GHz - 6,028,015,369 instructions # 1.43 insn per cycle - 2.402798408 seconds time elapsed +TOTAL : 2.290920 sec + 4,196,326,815 cycles # 1.830 GHz + 6,028,223,308 instructions # 1.44 insn per cycle + 2.294781353 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling index 88f80f3081..12174c306f 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:56:53 +DATE: 2025-12-07_18:13:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.480668e+05 1 256 -6.757720e+05 2 256 -1.342710e+06 4 256 -1.961408e+06 8 256 -2.863939e+06 16 256 -3.692840e+06 32 256 -4.108363e+06 64 256 -4.389055e+06 128 256 -4.590159e+06 256 256 -4.677980e+06 512 256 -4.719776e+06 1024 256 +3.856543e+05 1 256 +7.450947e+05 2 256 +1.347025e+06 4 256 +2.044406e+06 8 256 +3.017256e+06 16 256 +3.706346e+06 32 256 +4.125219e+06 64 256 +4.431098e+06 128 256 +4.607977e+06 256 256 +4.684190e+06 512 256 +4.702917e+06 1024 256 ### GPU: scaling test 32 -5.093214e+04 1 32 -9.453332e+04 2 32 -1.923664e+05 4 32 -3.828673e+05 8 32 -7.100352e+05 16 32 -1.286052e+06 32 32 -2.074968e+06 64 32 -2.993421e+06 128 32 -3.590529e+06 256 32 -4.025040e+06 512 32 -4.233186e+06 1024 32 -4.428606e+06 2048 32 -4.494795e+06 4096 32 -4.506986e+06 8192 32 +5.151751e+04 1 32 +1.100988e+05 2 32 +2.022433e+05 4 32 +3.983369e+05 8 32 +7.462001e+05 16 32 +1.353223e+06 32 32 +2.111765e+06 64 32 +3.048016e+06 128 32 +3.642056e+06 256 32 +4.056357e+06 512 32 +4.287933e+06 1024 32 +4.443250e+06 2048 32 +4.496016e+06 4096 32 +4.509793e+06 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.283518e+04 1 256 -2.360000e+04 2 256 -2.368362e+04 4 256 +2.294064e+04 1 256 +2.338910e+04 2 256 +2.391382e+04 4 256 ### CPU: scaling test 32 -2.195483e+04 1 32 -2.267087e+04 2 32 -2.328199e+04 4 32 +2.234815e+04 1 32 +2.275220e+04 2 32 +2.293295e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.369761e+04 1 256 -4.426783e+04 2 256 -4.443961e+04 4 256 +4.387884e+04 1 256 +4.418799e+04 2 256 +4.503427e+04 4 256 ### CPU: scaling test 32 -4.205894e+04 1 32 -4.154644e+04 2 32 -4.180789e+04 4 32 +4.064318e+04 1 32 +4.270322e+04 2 32 +4.381229e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.635620e+04 1 256 -8.373531e+04 2 256 -8.654539e+04 4 256 +8.812356e+04 1 256 +8.743199e+04 2 256 +8.907749e+04 4 256 ### CPU: scaling test 32 -8.995865e+04 1 32 -8.789712e+04 2 32 -8.901054e+04 4 32 +8.992174e+04 1 32 +8.926953e+04 2 32 +8.931674e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.711265e+04 1 256 -9.722643e+04 2 256 -9.347803e+04 4 256 +9.649134e+04 1 256 +9.456629e+04 2 256 +9.716425e+04 4 256 ### CPU: scaling test 32 -9.518909e+04 1 32 -9.721140e+04 2 32 -9.724959e+04 4 32 +9.802210e+04 1 32 +9.584555e+04 2 32 +9.693665e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.678497e+04 1 256 -6.627189e+04 2 256 -6.803332e+04 4 256 +6.830355e+04 1 256 +6.824017e+04 2 256 +6.827493e+04 4 256 ### CPU: scaling test 32 -6.749432e+04 1 32 -6.701283e+04 2 32 -6.598727e+04 4 32 +6.875629e+04 1 32 +6.726964e+04 2 32 +6.787696e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 5ea3c579b2..736ba23edb 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_16:29:39 +DATE: 2025-12-07_18:49:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.808698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.065448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.065448e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.890518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.085366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.085366e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.500490 sec - 2,152,747,639 cycles # 2.835 GHz - 3,089,120,012 instructions # 1.43 insn per cycle - 0.817131761 seconds time elapsed +TOTAL : 0.493484 sec + 2,121,515,835 cycles # 2.879 GHz + 3,095,078,878 instructions # 1.46 insn per cycle + 0.793739343 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.720979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.001076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.001076e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.715689e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.734995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.734995e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.786088 sec - 3,079,796,138 cycles # 2.856 GHz - 4,693,820,986 instructions # 1.52 insn per cycle - 1.137301736 seconds time elapsed +TOTAL : 0.786349 sec + 3,117,308,966 cycles # 2.903 GHz + 4,764,876,122 instructions # 1.53 insn per cycle + 1.135777371 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.340726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.352294e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.352294e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.384282e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.396010e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396010e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.027688 sec - 20,121,022,602 cycles # 2.862 GHz - 60,520,827,051 instructions # 3.01 insn per cycle - 7.031786887 seconds time elapsed +TOTAL : 6.900077 sec + 20,436,318,076 cycles # 2.961 GHz + 60,632,224,107 instructions # 2.97 insn per cycle + 6.904101996 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.433303e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.475603e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.475603e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.533552e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.575399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.575399e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.724019 sec - 10,754,955,259 cycles # 2.886 GHz - 31,220,075,253 instructions # 2.90 insn per cycle - 3.728441609 seconds time elapsed +TOTAL : 3.641607 sec + 10,771,255,218 cycles # 2.955 GHz + 31,222,461,894 instructions # 2.90 insn per cycle + 3.645784685 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799230e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.961399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.961399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.992063e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.153930e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.153930e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.890149 sec - 5,120,442,526 cycles # 2.704 GHz - 11,558,215,171 instructions # 2.26 insn per cycle - 1.894456584 seconds time elapsed +TOTAL : 1.849547 sec + 5,112,967,467 cycles # 2.760 GHz + 11,562,288,585 instructions # 2.26 insn per cycle + 1.853634874 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.595269e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.785975e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.785975e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.783173e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.971155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.971155e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.735302 sec - 4,701,578,061 cycles # 2.704 GHz - 10,861,447,059 instructions # 2.31 insn per cycle - 1.739681098 seconds time elapsed +TOTAL : 1.701199 sec + 4,700,944,477 cycles # 2.758 GHz + 10,863,355,860 instructions # 2.31 insn per cycle + 1.705186980 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.737162e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.834485e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.834485e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.848573e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.946947e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.946947e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.462185 sec - 4,238,690,147 cycles # 1.719 GHz - 6,064,850,138 instructions # 1.43 insn per cycle - 2.466509903 seconds time elapsed +TOTAL : 2.423070 sec + 4,246,448,381 cycles # 1.750 GHz + 6,068,588,150 instructions # 1.43 insn per cycle + 2.427354578 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 2fc1d7dc04..92943ea8c0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:20:41 +DATE: 2025-12-07_17:38:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.786288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.203485e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221467e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.000396e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204429e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221537e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470896 sec - 2,028,123,419 cycles # 2.825 GHz - 2,812,031,573 instructions # 1.39 insn per cycle - 0.775558684 seconds time elapsed +TOTAL : 0.464004 sec + 2,060,831,954 cycles # 2.910 GHz + 2,844,573,903 instructions # 1.38 insn per cycle + 0.765037285 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.146437e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.383510e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.397548e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.149784e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.392178e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.569288 sec - 2,428,652,206 cycles # 2.852 GHz - 3,427,874,591 instructions # 1.41 insn per cycle - 0.912714324 seconds time elapsed +TOTAL : 0.556348 sec + 2,398,360,199 cycles # 2.882 GHz + 3,492,773,628 instructions # 1.46 insn per cycle + 0.890596918 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.386609e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.398461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.398461e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.421783e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.433450e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.433450e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.886307 sec - 19,965,917,518 cycles # 2.898 GHz - 60,201,240,687 instructions # 3.02 insn per cycle - 6.890252778 seconds time elapsed +TOTAL : 6.785360 sec + 20,212,984,696 cycles # 2.978 GHz + 60,305,695,465 instructions # 2.98 insn per cycle + 6.789396492 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.533737e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.576916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.576916e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.657602e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.699453e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.699453e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.633851 sec - 10,579,683,505 cycles # 2.909 GHz - 30,847,655,837 instructions # 2.92 insn per cycle - 3.638097883 seconds time elapsed +TOTAL : 3.536481 sec + 10,572,268,137 cycles # 2.987 GHz + 30,847,127,588 instructions # 2.92 insn per cycle + 3.540428550 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.536026e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.682366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.682366e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.827095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.973996e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.973996e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.939515 sec - 5,249,266,634 cycles # 2.702 GHz - 11,982,858,846 instructions # 2.28 insn per cycle - 1.943675108 seconds time elapsed +TOTAL : 1.875007 sec + 5,247,107,125 cycles # 2.794 GHz + 11,983,514,854 instructions # 2.28 insn per cycle + 1.878958816 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4772) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.187873e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.358429e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.358429e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.574972e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.752402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.752402e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.803322 sec - 4,846,320,602 cycles # 2.683 GHz - 11,310,325,393 instructions # 2.33 insn per cycle - 1.807176987 seconds time elapsed +TOTAL : 1.730427 sec + 4,842,241,429 cycles # 2.793 GHz + 11,310,123,297 instructions # 2.34 insn per cycle + 1.734404885 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4455) (512y: 231) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.783861e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.878450e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.878450e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.179100e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.278487e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.278487e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.437468 sec - 4,222,471,079 cycles # 1.730 GHz - 6,310,155,112 instructions # 1.49 insn per cycle - 2.441536708 seconds time elapsed +TOTAL : 2.303819 sec + 4,212,382,949 cycles # 1.826 GHz + 6,309,769,132 instructions # 1.50 insn per cycle + 2.307762219 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1619) (512y: 119) (512z: 3648) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling index 66fa52db02..0dfaf1f344 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:42:24 +DATE: 2025-12-07_17:59:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.020563e+06 1 256 -1.907125e+06 2 256 -3.779714e+06 4 256 -7.211953e+06 8 256 -1.376478e+07 16 256 -2.148631e+07 32 256 -2.475235e+07 64 256 -2.658152e+07 128 256 -2.709334e+07 256 256 -2.813503e+07 512 256 -2.865513e+07 1024 256 +1.004213e+06 1 256 +2.007544e+06 2 256 +3.995458e+06 4 256 +7.131267e+06 8 256 +1.437052e+07 16 256 +2.111416e+07 32 256 +2.511863e+07 64 256 +2.685402e+07 128 256 +2.736777e+07 256 256 +2.819329e+07 512 256 +2.939376e+07 1024 256 ### GPU: scaling test 32 -1.249239e+05 1 32 -2.576023e+05 2 32 -5.236416e+05 4 32 -9.816703e+05 8 32 -1.909308e+06 16 32 -3.564529e+06 32 32 -7.104303e+06 64 32 -1.425315e+07 128 32 -2.099087e+07 256 32 -2.446553e+07 512 32 -2.604809e+07 1024 32 -2.693465e+07 2048 32 -2.780197e+07 4096 32 -2.832618e+07 8192 32 +1.246082e+05 1 32 +2.611520e+05 2 32 +5.335623e+05 4 32 +9.550172e+05 8 32 +2.001259e+06 16 32 +3.773849e+06 32 32 +7.714852e+06 64 32 +1.450908e+07 128 32 +2.080281e+07 256 32 +2.458775e+07 512 32 +2.612411e+07 1024 32 +2.657863e+07 2048 32 +2.749900e+07 4096 32 +2.870639e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.475086e+04 1 256 -2.477196e+04 2 256 -2.498053e+04 4 256 +2.455266e+04 1 256 +2.506583e+04 2 256 +2.478268e+04 4 256 ### CPU: scaling test 32 -2.306794e+04 1 32 -2.472476e+04 2 32 -2.481117e+04 4 32 +2.471348e+04 1 32 +2.463806e+04 2 32 +2.468742e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.800127e+04 1 256 -7.895709e+04 2 256 -7.905572e+04 4 256 +7.668604e+04 1 256 +7.808799e+04 2 256 +7.912285e+04 4 256 ### CPU: scaling test 32 -7.190850e+04 1 32 -7.327190e+04 2 32 -7.683355e+04 4 32 +7.710546e+04 1 32 +7.362915e+04 2 32 +7.667343e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.743170e+05 1 256 -1.714585e+05 2 256 -1.739702e+05 4 256 +1.739572e+05 1 256 +1.737292e+05 2 256 +1.743767e+05 4 256 ### CPU: scaling test 32 -1.605789e+05 1 32 -1.673207e+05 2 32 -1.747798e+05 4 32 +1.685275e+05 1 32 +1.709169e+05 2 32 +1.596868e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.847081e+05 1 256 -1.886928e+05 2 256 -1.844591e+05 4 256 +1.880412e+05 1 256 +1.866140e+05 2 256 +1.888751e+05 4 256 ### CPU: scaling test 32 -1.678389e+05 1 32 -1.901615e+05 2 32 -1.805064e+05 4 32 +1.890806e+05 1 32 +1.901480e+05 2 32 +1.853050e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.398580e+05 1 256 -1.377336e+05 2 256 -1.394286e+05 4 256 +1.403371e+05 1 256 +1.404862e+05 2 256 +1.395936e+05 4 256 ### CPU: scaling test 32 -1.350638e+05 1 32 -1.419406e+05 2 32 -1.392215e+05 4 32 +1.408668e+05 1 32 +1.349164e+05 2 32 +1.409693e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 359e7877d9..c1cb258873 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:22:22 +DATE: 2025-12-07_17:39:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.012111e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.590020e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.652888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.084974e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.606396e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.663535e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 -TOTAL : 0.461660 sec - 2,024,209,134 cycles # 2.804 GHz - 2,785,160,230 instructions # 1.38 insn per cycle - 0.779091198 seconds time elapsed +TOTAL : 0.454854 sec + 2,026,129,591 cycles # 2.901 GHz + 2,798,511,840 instructions # 1.38 insn per cycle + 0.756318193 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.304364e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.823335e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.855285e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339706e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.827574e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.858034e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.506727 sec - 2,201,759,148 cycles # 2.852 GHz - 3,068,173,195 instructions # 1.39 insn per cycle - 0.828420263 seconds time elapsed +TOTAL : 0.503109 sec + 2,186,177,889 cycles # 2.879 GHz + 3,078,042,558 instructions # 1.41 insn per cycle + 0.818345172 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. @@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.501069e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.514090e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.514090e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.548009e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.561027e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561027e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.569879 sec - 19,152,579,978 cycles # 2.914 GHz - 59,680,745,465 instructions # 3.12 insn per cycle - 6.573833440 seconds time elapsed +TOTAL : 6.448474 sec + 19,278,332,121 cycles # 2.988 GHz + 59,837,769,632 instructions # 3.10 insn per cycle + 6.452274875 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129949097065833 +Relative difference = 6.390214879988402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP 
[gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.920524e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.053952e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.053952e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.132267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.267758e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.267758e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.086277 sec - 6,057,068,110 cycles # 2.899 GHz - 17,105,898,955 instructions # 2.82 insn per cycle - 2.090214636 seconds time elapsed +TOTAL : 2.032633 sec + 6,055,978,409 cycles # 2.975 GHz + 17,105,926,620 instructions # 2.82 insn per cycle + 2.036392907 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.737565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.737565e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.754653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816091e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.993425 sec - 2,677,007,034 cycles # 2.687 GHz - 6,240,512,600 instructions # 2.33 insn per cycle - 0.997226702 seconds time elapsed +TOTAL : 0.951673 sec + 2,670,608,131 cycles # 2.798 GHz + 6,241,050,688 instructions # 2.34 insn per cycle + 0.955410948 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.843149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912179e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912179e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954560e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.907079 sec - 2,478,306,991 cycles # 2.723 GHz - 5,867,870,372 instructions # 2.37 insn per cycle - 0.910927509 seconds time elapsed +TOTAL : 0.886654 sec + 2,476,592,350 cycles # 2.784 GHz + 5,867,598,490 instructions # 2.37 insn per cycle + 0.890401868 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.382994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.423338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.423338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.456046e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499309e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.206279 sec - 2,116,978,988 cycles # 1.750 GHz - 3,424,879,930 instructions # 1.62 insn per cycle - 1.210305817 seconds time elapsed +TOTAL : 1.144481 sec + 2,111,635,080 cycles # 1.841 GHz + 3,424,968,205 instructions # 1.62 insn per cycle + 1.148206997 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling index 03b7dc0471..f33b203849 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:58:16 +DATE: 2025-12-07_18:14:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.727486e+05 1 256 -7.374228e+05 2 256 -1.359495e+06 4 256 -2.228941e+06 8 256 -3.376485e+06 16 256 -4.469020e+06 32 256 -5.249324e+06 64 256 -5.869764e+06 128 256 -6.094954e+06 256 256 -6.260097e+06 512 256 -6.357949e+06 1024 256 +4.000388e+05 1 256 +7.119041e+05 2 256 +1.450443e+06 4 256 +2.249027e+06 8 256 +3.544142e+06 16 256 +4.521578e+06 32 256 +5.318710e+06 64 256 +5.897401e+06 128 256 +6.115624e+06 256 256 +6.262611e+06 512 256 +6.345780e+06 1024 256 ### GPU: scaling test 32 -5.112115e+04 1 32 -9.374377e+04 2 32 -1.887009e+05 4 32 -3.960359e+05 8 32 -7.300603e+05 16 32 -1.308116e+06 32 32 -1.995847e+06 64 32 -3.417585e+06 128 32 -4.455777e+06 256 32 -5.284200e+06 512 32 -5.826269e+06 1024 32 -6.082445e+06 2048 32 -6.255269e+06 4096 32 -6.329872e+06 8192 32 +5.632504e+04 1 32 +1.071483e+05 2 32 +2.112100e+05 4 32 +4.008154e+05 8 32 +7.995902e+05 16 32 +1.419366e+06 32 32 +2.233452e+06 64 32 +3.447292e+06 128 32 +4.534036e+06 256 32 +5.290783e+06 512 32 +5.895573e+06 1024 32 +6.101833e+06 2048 32 +6.269906e+06 4096 32 +6.335513e+06 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.438060e+04 1 256 -2.470219e+04 2 256 -2.476066e+04 4 256 +2.379188e+04 1 256 +2.495384e+04 2 256 +2.504893e+04 4 256 ### CPU: scaling test 32 -2.461887e+04 1 32 -2.470134e+04 2 32 -2.410740e+04 4 32 +2.343296e+04 1 32 +2.466768e+04 2 32 +2.449916e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.129456e+04 1 256 -7.835869e+04 2 256 -7.787307e+04 4 256 +7.601268e+04 1 256 +7.864726e+04 2 256 +7.942380e+04 4 256 ### CPU: scaling test 32 -6.724611e+04 1 32 -6.848385e+04 2 32 -7.303564e+04 4 32 +7.435083e+04 1 32 +7.443548e+04 2 32 +7.720690e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.606597e+05 1 256 
-1.630584e+05 2 256 -1.606208e+05 4 256 +1.739903e+05 1 256 +1.733013e+05 2 256 +1.744532e+05 4 256 ### CPU: scaling test 32 -1.551508e+05 1 32 -1.588322e+05 2 32 -1.636465e+05 4 32 +1.631205e+05 1 32 +1.686452e+05 2 32 +1.719369e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.742285e+05 1 256 -1.758288e+05 2 256 -1.738872e+05 4 256 +1.873706e+05 1 256 +1.892253e+05 2 256 +1.873775e+05 4 256 ### CPU: scaling test 32 -1.750902e+05 1 32 -1.718448e+05 2 32 -1.870659e+05 4 32 +1.795040e+05 1 32 +1.837623e+05 2 32 +1.843799e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.405438e+05 1 256 -1.389272e+05 2 256 -1.380473e+05 4 256 +1.400111e+05 1 256 +1.397639e+05 2 256 +1.389151e+05 4 256 ### CPU: scaling test 32 -1.416732e+05 1 32 -1.383910e+05 2 32 -1.393492e+05 4 32 +1.182404e+05 1 32 +1.392727e+05 2 32 +1.389429e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index b34d8177c5..ac41028049 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_16:30:12 +DATE: 2025-12-07_18:49:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563182e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.822216e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.822216e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.679779e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.828784e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828784e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 -TOTAL : 0.474333 sec - 2,020,095,914 cycles # 2.815 GHz - 2,863,432,755 instructions # 1.42 insn per cycle - 0.775295436 seconds time elapsed +TOTAL : 0.468428 sec + 2,039,217,988 cycles # 2.870 GHz + 2,952,718,826 instructions # 1.45 insn per cycle + 0.767771487 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.400607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017646e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017646e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.395155e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968016e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968016e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.650114 sec - 2,601,943,365 cycles # 2.840 GHz - 3,913,396,482 instructions # 1.50 insn per cycle - 0.976170377 seconds time elapsed +TOTAL : 0.645433 sec + 2,642,500,096 cycles # 2.905 GHz + 3,946,979,262 instructions # 1.49 insn per cycle + 0.969619403 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.486527e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.499486e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.499486e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.514110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.527005e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.527005e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.611886 sec - 19,177,870,695 cycles # 2.899 GHz - 59,684,285,229 instructions # 3.11 insn per cycle - 6.615966746 seconds time elapsed +TOTAL : 6.538849 sec + 19,305,887,589 cycles # 2.951 GHz + 59,841,359,007 instructions # 3.10 insn per cycle + 6.542674470 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -128,8 +122,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129949097065833 +Relative difference = 6.390214879988402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.840675e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.974875e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.974875e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.980797e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.117398e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.117398e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.112189 sec - 6,078,517,802 cycles # 2.874 GHz - 17,153,031,314 instructions # 2.82 insn per cycle - 2.116275288 seconds time elapsed +TOTAL : 2.075267 sec + 6,084,409,668 cycles # 2.929 GHz + 17,153,653,535 instructions # 2.82 insn per cycle + 2.079248239 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.674765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.733725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.733725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734519e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795288e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.001010 sec - 2,696,240,098 cycles # 2.685 GHz - 6,276,404,164 instructions # 2.33 insn per cycle - 1.005076444 seconds time elapsed +TOTAL : 0.966505 sec + 2,688,725,058 cycles # 2.773 GHz + 6,276,779,038 instructions # 2.33 insn per cycle + 0.970429933 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.902384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.902384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.877850e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949430e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.916582 sec - 2,498,079,452 cycles # 2.717 GHz - 5,903,755,317 instructions # 2.36 insn per cycle - 0.920755361 seconds time elapsed +TOTAL : 0.894055 sec + 2,494,096,931 cycles # 2.779 GHz + 5,903,813,574 instructions # 2.37 insn per cycle + 0.898011662 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.388850e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.429977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.429977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.442394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.485434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.485434e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.204887 sec - 2,137,027,835 cycles # 1.769 GHz - 3,465,402,298 instructions # 1.62 insn per cycle - 1.209022745 seconds time elapsed +TOTAL : 1.160279 sec + 2,133,219,646 cycles # 1.834 GHz + 3,465,288,502 instructions # 1.62 insn per cycle + 1.164278641 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 1d664001ba..d72f4f129b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:22:52 +DATE: 2025-12-07_17:40:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.986981e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577936e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.642909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.098040e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613039e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.669526e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 -TOTAL : 0.465752 sec - 2,027,464,804 cycles # 2.839 GHz - 2,776,602,524 instructions # 1.37 insn per cycle - 0.772091406 seconds time elapsed +TOTAL : 0.457316 sec + 2,011,347,364 cycles # 2.870 GHz + 2,811,739,081 instructions # 1.40 insn per cycle + 0.758160523 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 203 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.311817e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.830173e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.862677e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.350654e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.843471e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872566e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.507862 sec - 2,193,078,964 cycles # 2.843 GHz - 3,061,556,319 instructions # 1.40 insn per cycle - 0.829701653 seconds time elapsed +TOTAL : 0.498524 sec + 2,190,264,377 cycles # 2.892 GHz + 3,075,940,882 instructions # 1.40 insn per cycle + 0.813902563 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. @@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.494083e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.506993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.506993e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.558966e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.572265e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.572265e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.588418 sec - 19,053,983,564 cycles # 2.891 GHz - 59,396,932,644 instructions # 3.12 insn per cycle - 6.592397812 seconds time elapsed +TOTAL : 6.421016 sec + 19,180,532,523 cycles # 2.986 GHz + 59,554,927,817 instructions # 3.10 insn per cycle + 6.424812611 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 868) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129949097065833 +Relative difference = 6.390214879988402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP 
[gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.236693e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.382500e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.382500e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.574586e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.724385e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.724385e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.007204 sec - 5,773,782,949 cycles # 2.872 GHz - 16,883,450,737 instructions # 2.92 insn per cycle - 2.011190459 seconds time elapsed +TOTAL : 1.928009 sec + 5,774,847,357 cycles # 2.992 GHz + 16,883,276,498 instructions # 2.92 insn per cycle + 1.931759366 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.456033e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499646e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.518749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.564264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.564264e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.143466 sec - 3,080,089,782 cycles # 2.686 GHz - 6,901,917,276 instructions # 2.24 insn per cycle - 1.147397013 seconds time elapsed +TOTAL : 1.095783 sec + 3,078,383,775 cycles # 2.801 GHz + 6,902,209,433 instructions # 2.24 insn per cycle + 1.099714182 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5760) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.551832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.601891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.601891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.635985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.689534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.689534e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.074026 sec - 2,869,050,546 cycles # 2.664 GHz - 6,490,617,462 instructions # 2.26 insn per cycle - 1.077819814 seconds time elapsed +TOTAL : 1.019086 sec + 2,861,478,977 cycles # 2.799 GHz + 6,490,012,434 instructions # 2.27 insn per cycle + 1.022908962 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5562) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.278723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313246e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313246e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.348388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.385615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.385615e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.301798 sec - 2,284,363,028 cycles # 1.751 GHz - 3,800,071,631 instructions # 1.66 insn per cycle - 1.305803750 seconds time elapsed +TOTAL : 1.234608 sec + 2,280,388,038 cycles # 1.843 GHz + 3,799,887,441 instructions # 1.67 insn per cycle + 1.238386975 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2577) (512y: 9) (512z: 4061) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling index 61f28ab393..e3fa70ca70 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:42:03 +DATE: 2025-12-07_17:58:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -9.413980e+05 1 256 -1.824479e+06 2 256 -3.751768e+06 4 256 -6.821687e+06 8 256 -8.893057e+06 16 256 -1.069198e+07 32 256 -1.203562e+07 64 256 -1.299650e+07 128 256 -1.326879e+07 256 256 -1.353754e+07 512 256 -1.376766e+07 1024 256 +9.617841e+05 1 256 +1.983958e+06 2 256 +3.610999e+06 4 256 +7.118798e+06 8 256 +8.948037e+06 16 256 +1.085476e+07 32 256 +1.202932e+07 64 256 +1.296550e+07 128 256 +1.351441e+07 256 256 +1.352836e+07 512 256 +1.379502e+07 1024 256 ### GPU: scaling test 32 -1.264842e+05 1 32 -2.411881e+05 2 32 -5.002345e+05 4 32 -8.959915e+05 8 32 -1.929825e+06 16 32 -3.400412e+06 32 32 -6.965891e+06 64 32 -9.374242e+06 128 32 -1.031547e+07 256 32 -1.114517e+07 512 32 -1.169216e+07 1024 32 -1.186544e+07 2048 32 -1.211002e+07 4096 32 -1.215036e+07 8192 32 +1.151178e+05 1 32 +2.603658e+05 2 32 +4.583608e+05 4 32 +9.728624e+05 8 32 +1.897146e+06 16 32 +3.358985e+06 32 32 +7.127668e+06 64 32 +8.989455e+06 128 32 +1.048920e+07 256 32 +1.105514e+07 512 32 +1.170080e+07 1024 32 +1.191345e+07 2048 32 +1.211207e+07 4096 32 +1.213936e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.309135e+04 1 256 -2.331383e+04 2 256 -2.334383e+04 4 256 +2.315345e+04 1 256 +2.327132e+04 2 256 +2.355820e+04 4 256 ### CPU: scaling test 32 -2.173266e+04 1 32 -2.264555e+04 2 32 -2.214409e+04 4 32 +2.325181e+04 1 32 +2.273427e+04 2 32 +2.306218e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.454087e+04 1 256 -4.509478e+04 2 256 -4.547146e+04 4 256 +4.513881e+04 1 256 +4.503178e+04 2 256 +4.534207e+04 4 256 ### CPU: scaling test 32 -4.000635e+04 1 32 -4.240489e+04 2 32 -4.447787e+04 4 32 +4.297471e+04 1 32 +4.373870e+04 2 32 +4.509302e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.989478e+04 1 256 
-8.788512e+04 2 256 -9.013990e+04 4 256 +9.135451e+04 1 256 +9.123285e+04 2 256 +9.073763e+04 4 256 ### CPU: scaling test 32 -9.025857e+04 1 32 -9.054908e+04 2 32 -8.932416e+04 4 32 +9.105913e+04 1 32 +9.070552e+04 2 32 +9.050068e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.982270e+04 1 256 -9.959330e+04 2 256 -9.964108e+04 4 256 +1.000780e+05 1 256 +9.946714e+04 2 256 +9.996705e+04 4 256 ### CPU: scaling test 32 -9.318362e+04 1 32 -1.002699e+05 2 32 -9.968832e+04 4 32 +1.002535e+05 1 32 +9.969981e+04 2 32 +9.910427e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.767141e+04 1 256 -6.818529e+04 2 256 -6.881658e+04 4 256 +6.893549e+04 1 256 +6.889920e+04 2 256 +6.913857e+04 4 256 ### CPU: scaling test 32 -6.813396e+04 1 32 -6.831571e+04 2 32 -6.860475e+04 4 32 +6.889203e+04 1 32 +6.988105e+04 2 32 +6.750499e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 66176b2229..d0241859f6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:21:14 +DATE: 2025-12-07_17:38:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.723520e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201379e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219641e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.003330e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.207487e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.223989e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.472516 sec - 2,054,090,006 cycles # 2.841 GHz - 2,817,756,219 instructions # 1.37 insn per cycle - 0.780308929 seconds time elapsed +TOTAL : 0.464766 sec + 2,051,788,378 cycles # 2.895 GHz + 2,839,653,587 instructions # 1.38 insn per cycle + 0.766103998 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.127139e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354786e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367576e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.137731e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.361145e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374138e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.567470 sec - 2,434,469,025 cycles # 2.854 GHz - 3,429,413,924 instructions # 1.41 insn per cycle - 0.911221936 seconds time elapsed +TOTAL : 0.556036 sec + 2,415,693,289 cycles # 2.910 GHz + 3,457,592,594 instructions # 1.43 insn per cycle + 0.889897915 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.325558e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336921e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336921e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.384793e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.396217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396217e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.066864 sec - 20,436,241,353 cycles # 2.891 GHz - 61,613,414,820 instructions # 3.01 insn per cycle - 7.070927861 seconds time elapsed +TOTAL : 6.890655 sec + 20,488,073,984 cycles # 2.972 GHz + 61,507,358,237 instructions # 3.00 insn per cycle + 6.894440901 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213859069593 -Relative difference = 4.345647726386255e-07 +Avg ME (F77/C++) = 1.4131213846377075 +Relative difference = 4.354629624727387e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.581252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624148e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.624148e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.698017e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.742277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.742277e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596315 sec - 10,491,200,280 cycles # 2.915 GHz - 30,713,063,869 instructions # 2.93 insn per cycle - 3.600269209 seconds time elapsed +TOTAL : 3.506876 sec + 10,505,657,477 cycles # 2.994 GHz + 30,722,371,968 instructions # 2.92 insn per cycle + 3.510807653 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 
2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213813302705 -Relative difference = 4.3780348012864624e-07 +Avg ME (F77/C++) = 1.4131213752166187 +Relative difference = 4.421298240727834e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.021587e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.189187e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.189187e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.347461e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.514638e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.514638e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.836324 sec - 4,963,572,150 cycles # 2.698 GHz - 11,329,877,800 instructions # 2.28 insn per cycle - 1.840366477 seconds time elapsed +TOTAL : 1.772815 sec + 4,966,681,171 cycles # 2.797 GHz + 11,317,864,622 instructions # 2.28 insn per cycle + 1.776841287 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4650) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.809724e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.000340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000340e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.026582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046729e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.046729e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.690468 sec - 4,546,028,597 cycles # 2.684 GHz - 10,641,089,172 instructions # 2.34 insn per cycle - 1.694422805 seconds time elapsed +TOTAL : 1.615460 sec + 4,541,378,515 cycles # 2.806 GHz 
+ 10,626,329,249 instructions # 2.34 insn per cycle + 1.619441409 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4468) (512y: 47) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.931835e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.029866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.029866e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.300959e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.403231e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.403231e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.386097 sec - 4,162,019,401 cycles # 1.742 GHz - 5,999,960,287 instructions # 1.44 insn per cycle - 2.390275923 seconds time elapsed +TOTAL : 2.265915 sec + 4,146,477,177 cycles # 1.828 GHz + 5,946,992,159 instructions # 1.43 insn per cycle + 2.269808848 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1724) (512y: 63) (512z: 3594) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling index d8428305ae..1ca8506d49 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ 
-MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:57:35 +DATE: 2025-12-07_18:14:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.849872e+05 1 256 -5.950036e+05 2 256 -1.135532e+06 4 256 -9.336754e+05 8 256 -2.668945e+06 16 256 -3.526097e+06 32 256 -4.045575e+06 64 256 -4.557983e+06 128 256 -4.782891e+06 256 256 -4.835057e+06 512 256 -4.861240e+06 1024 256 +2.946997e+05 1 256 +6.051401e+05 2 256 +1.167542e+06 4 256 +1.881417e+06 8 256 +2.770057e+06 16 256 +3.572491e+06 32 256 +4.111065e+06 64 256 +4.627948e+06 128 256 +4.771097e+06 256 256 +4.842070e+06 512 256 +4.862049e+06 1024 256 ### GPU: scaling test 32 -3.826136e+04 1 32 -7.325127e+04 2 32 -1.481027e+05 4 32 -3.040622e+05 8 32 -6.040500e+05 16 32 -1.089306e+06 32 32 -1.777835e+06 64 32 -2.826455e+06 128 32 -3.481738e+06 256 32 -3.995216e+06 512 32 -4.416099e+06 1024 32 -4.561881e+06 2048 32 -4.594627e+06 4096 32 -4.620875e+06 8192 32 +3.992624e+04 1 32 +7.880696e+04 2 32 +1.461509e+05 4 32 +3.056684e+05 8 32 +6.176794e+05 16 32 +1.151887e+06 32 32 +1.800628e+06 64 32 +2.879928e+06 128 32 +3.510954e+06 256 32 +4.022065e+06 512 32 +4.418294e+06 1024 32 +4.551658e+06 2048 32 +4.600691e+06 4096 32 +4.614581e+06 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.314037e+04 1 256 -2.324071e+04 2 256 -2.351748e+04 4 256 +2.306991e+04 1 256 +2.320955e+04 2 256 +2.318267e+04 4 256 ### CPU: scaling test 32 -2.156289e+04 1 32 -2.224284e+04 2 32 -2.270647e+04 4 32 +2.317974e+04 1 32 +2.315920e+04 2 32 +2.248902e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.464955e+04 1 256 -4.456312e+04 2 256 -4.557593e+04 4 256 +4.437779e+04 1 256 +4.501804e+04 2 256 +4.496047e+04 4 256 ### CPU: scaling test 32 -3.776841e+04 1 32 -4.243663e+04 2 32 -4.407623e+04 4 32 +3.887458e+04 1 32 +4.206663e+04 2 32 +4.338420e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.329077e+04 1 256 -8.946504e+04 2 256 -8.934937e+04 4 256 +8.486061e+04 1 256 +8.424664e+04 2 256 +8.527915e+04 4 256 ### CPU: scaling test 32 -8.542423e+04 1 32 -9.061011e+04 2 32 -9.100728e+04 4 32 +8.326872e+04 1 32 +8.438585e+04 2 32 +8.423613e+04 4 32 ========================================================================= 
scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.619475e+04 1 256 -1.000794e+05 2 256 -9.841918e+04 4 256 +9.982461e+04 1 256 +9.225893e+04 2 256 +9.147032e+04 4 256 ### CPU: scaling test 32 -9.793151e+04 1 32 -9.901818e+04 2 32 -9.971627e+04 4 32 +8.901524e+04 1 32 +9.212743e+04 2 32 +9.172235e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.804216e+04 1 256 -6.812091e+04 2 256 -6.863263e+04 4 256 +6.332157e+04 1 256 +6.634153e+04 2 256 +6.563831e+04 4 256 ### CPU: scaling test 32 -6.817141e+04 1 32 -6.704119e+04 2 32 -6.858619e+04 4 32 +6.169745e+04 1 32 +6.278862e+04 2 32 +6.459212e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index b5540e725a..da8e41ff70 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:21:49 +DATE: 2025-12-07_17:39:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.729045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193827e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214345e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.949654e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214873e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.476302 sec - 2,069,585,848 cycles # 2.841 GHz - 2,809,792,568 instructions # 1.36 insn per cycle - 0.788016398 seconds time elapsed +TOTAL : 0.470094 sec + 2,020,451,589 cycles # 2.823 GHz + 2,801,289,024 instructions # 1.39 insn per cycle + 0.772790683 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.148157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.386565e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.400273e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.152138e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.381834e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395362e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.562536 sec - 2,368,600,308 cycles # 2.829 GHz - 3,390,907,468 instructions # 1.43 insn per cycle - 0.897403591 seconds time elapsed +TOTAL : 0.553776 sec + 2,418,660,927 cycles # 2.913 GHz + 3,491,764,141 instructions # 1.44 insn per cycle + 0.888716899 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.347035e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.358476e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.358476e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.413745e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425611e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425611e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.001676 sec - 20,340,735,873 cycles # 2.904 GHz - 61,296,698,560 instructions # 3.01 insn per cycle - 7.005669304 seconds time elapsed +TOTAL : 6.808321 sec + 20,332,936,356 cycles # 2.985 GHz + 61,191,329,416 instructions # 3.01 insn per cycle + 6.812145379 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213859069593 -Relative difference = 4.345647726386255e-07 +Avg ME (F77/C++) = 1.4131213846377075 +Relative difference = 4.354629624727387e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588929e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.632804e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.632804e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.742670e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.786085e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.786085e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.590204 sec - 10,378,021,696 cycles # 2.888 GHz - 30,395,025,188 instructions # 2.93 insn per cycle - 3.594207111 seconds time elapsed +TOTAL : 3.473626 sec + 10,391,866,744 cycles # 2.989 GHz + 30,403,708,350 instructions # 2.93 insn per cycle + 3.477403856 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4954) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 
2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213813302705 -Relative difference = 4.3780348012864624e-07 +Avg ME (F77/C++) = 1.4131213752166187 +Relative difference = 4.421298240727834e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.624880e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.780155e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.780155e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.997495e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151792e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151792e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.920064 sec - 5,168,529,008 cycles # 2.687 GHz - 11,822,995,259 instructions # 2.29 insn per cycle - 1.924192404 seconds time elapsed +TOTAL : 1.840116 sec + 5,160,576,065 cycles # 2.800 GHz + 11,811,687,682 instructions # 2.29 insn per cycle + 1.844014864 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4749) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.374636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.559382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.559382e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.816926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000177e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000177e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.767863 sec - 4,740,196,866 cycles # 2.676 GHz - 11,146,224,662 instructions # 2.35 insn per cycle - 1.772001982 seconds time elapsed +TOTAL : 1.688224 sec + 4,726,707,634 cycles # 2.795 GHz 
+ 11,131,435,655 instructions # 2.36 insn per cycle + 1.692235132 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 221) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.914882e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.012925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.012925e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.262526e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.365368e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.365368e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.391894 sec - 4,182,595,672 cycles # 1.747 GHz - 6,238,269,996 instructions # 1.49 insn per cycle - 2.395956127 seconds time elapsed +TOTAL : 2.277276 sec + 4,173,449,291 cycles # 1.831 GHz + 6,185,519,253 instructions # 1.48 insn per cycle + 2.281277198 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1623) (512y: 120) (512z: 3678) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling index 5a05ffd4cc..ac35595cad 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ 
-MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:42:45 +DATE: 2025-12-07_17:59:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.797622e+05 1 256 -3.709787e+05 2 256 -3.836692e+05 4 256 -4.274394e+05 8 256 -4.457291e+05 16 256 -4.426930e+05 32 256 -4.430121e+05 64 256 -4.414634e+05 128 256 -4.537983e+05 256 256 -4.587406e+05 512 256 -4.539498e+05 1024 256 +2.824139e+05 1 256 +3.659501e+05 2 256 +3.820365e+05 4 256 +4.289044e+05 8 256 +4.468056e+05 16 256 +4.428859e+05 32 256 +4.439936e+05 64 256 +4.394787e+05 128 256 +4.499273e+05 256 256 +4.545834e+05 512 256 +4.542711e+05 1024 256 ### GPU: scaling test 32 -5.646557e+04 1 32 -1.072891e+05 2 32 -1.807325e+05 4 32 -2.717613e+05 8 32 -3.826661e+05 16 32 -3.951829e+05 32 32 -4.316071e+05 64 32 -4.432349e+05 128 32 -4.449540e+05 256 32 -4.447744e+05 512 32 -4.444094e+05 1024 32 -4.520916e+05 2048 32 -4.578060e+05 4096 32 -4.571634e+05 8192 32 +5.758368e+04 1 32 +1.100837e+05 2 32 +1.387761e+05 4 32 +2.731363e+05 8 32 +3.949453e+05 16 32 +3.962080e+05 32 32 +4.319051e+05 64 32 +4.448655e+05 128 32 +4.464887e+05 256 32 +4.497738e+05 512 32 +4.422157e+05 1024 32 +4.516828e+05 2048 32 +4.557755e+05 4096 32 +4.568166e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.852732e+03 1 256 -1.852838e+03 2 256 -1.863778e+03 4 256 +1.868713e+03 1 256 +1.924807e+03 2 256 +1.920719e+03 4 256 ### CPU: scaling test 32 -1.849128e+03 1 32 -1.851000e+03 2 32 -1.853111e+03 4 32 +1.901842e+03 1 32 +1.864500e+03 2 32 +1.916030e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.433326e+03 1 256 -3.428849e+03 2 256 -3.434375e+03 4 256 +3.470582e+03 1 256 +3.387915e+03 2 256 +3.489997e+03 4 256 ### CPU: scaling test 32 -3.324011e+03 1 32 -3.385678e+03 2 32 -3.337661e+03 4 32 +3.387852e+03 1 32 +3.407104e+03 2 32 +3.410327e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.888262e+03 1 256 -7.910674e+03 2 256 -7.940995e+03 4 256 +7.951708e+03 1 256 +8.184519e+03 2 256 +8.036752e+03 4 256 ### CPU: scaling test 32 -7.181194e+03 1 32 -7.616753e+03 2 32 -7.493920e+03 4 32 +7.731847e+03 1 32 +7.789637e+03 2 32 +7.794909e+03 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.845276e+03 1 256 -8.896166e+03 2 256 -8.958296e+03 4 256 +9.322153e+03 1 256 +9.095826e+03 2 256 +9.248215e+03 4 256 ### CPU: scaling test 32 -8.632795e+03 1 32 -8.574113e+03 2 32 -8.618805e+03 4 32 +8.613949e+03 1 32 +8.755614e+03 2 32 +8.868215e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.742240e+03 1 256 -6.762831e+03 2 256 -6.833848e+03 4 256 +6.950360e+03 1 256 +7.125878e+03 2 256 +7.021509e+03 4 256 ### CPU: scaling test 32 -6.602630e+03 1 32 -6.602109e+03 2 32 -6.640282e+03 4 32 +6.629400e+03 1 32 +6.570680e+03 2 32 +6.666150e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5da31552e6..49a68b5c66 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:23:20 +DATE: 2025-12-07_17:40:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.393219e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441536e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.444704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.432936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.474358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477129e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.499467 sec - 2,136,562,888 cycles # 2.840 GHz - 3,115,290,958 instructions # 1.46 insn per cycle - 0.813463478 seconds time elapsed +TOTAL : 0.495304 sec + 2,178,492,604 cycles # 2.913 GHz + 3,141,593,635 instructions # 1.44 insn per cycle + 0.807450699 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.853765e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854661e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854661e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.874624e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875560e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875560e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.853472 sec - 25,658,433,103 cycles # 2.897 GHz - 78,568,001,018 instructions # 3.06 insn per cycle - 8.857417932 seconds time elapsed +TOTAL : 8.754992 sec + 25,635,816,627 cycles # 2.927 GHz + 78,334,414,389 instructions # 3.06 insn per cycle + 8.758915025 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376471e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.379465e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.379465e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.509149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512369e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512369e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.863682 sec - 13,076,523,489 cycles # 2.687 GHz - 39,590,979,607 instructions # 3.03 insn per cycle - 4.867732270 seconds time elapsed +TOTAL : 4.679502 sec + 13,073,353,892 
cycles # 2.792 GHz + 39,589,661,629 instructions # 3.03 insn per cycle + 4.683486486 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.895651e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.911901e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.911901e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.806777e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.821939e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.821939e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.083250 sec - 5,645,439,415 cycles # 2.706 GHz - 13,860,388,601 instructions # 2.46 insn per cycle - 2.087459740 seconds time elapsed +TOTAL : 2.110100 sec + 5,645,623,386 cycles # 2.675 GHz + 13,861,313,472 instructions # 2.46 insn per cycle + 2.116083833 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.894010e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.914275e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.914275e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.113305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.134375e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134375e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.850375 sec - 5,008,092,310 cycles # 2.702 GHz - 12,556,513,170 instructions # 2.51 insn per cycle - 1.855114099 seconds time elapsed +TOTAL : 1.805599 sec + 5,001,627,349 cycles # 2.765 GHz + 12,556,113,217 instructions # 2.51 insn per cycle + 1.809682693 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.736940e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.749376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.749376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.072870e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.085587e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.085587e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.440997 sec - 4,200,411,405 
cycles # 1.718 GHz - 6,424,496,970 instructions # 1.53 insn per cycle - 2.445446290 seconds time elapsed +TOTAL : 2.324852 sec + 4,196,280,668 cycles # 1.803 GHz + 6,424,134,453 instructions # 1.53 insn per cycle + 2.328880173 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling index 30ffb7f326..0dfb2244a2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:58:57 +DATE: 2025-12-07_18:15:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.872973e+05 1 256 -2.845184e+05 2 256 -3.112851e+05 4 256 -3.602269e+05 8 256 -3.862982e+05 16 256 -3.927910e+05 32 256 -3.975811e+05 64 256 -3.994813e+05 128 256 -3.982764e+05 256 256 -4.044121e+05 512 256 -4.143519e+05 1024 256 +1.884474e+05 1 256 +2.683110e+05 2 256 +3.156878e+05 4 256 +3.600746e+05 8 256 +3.847094e+05 16 256 +3.954558e+05 32 256 +3.975148e+05 64 256 +3.950837e+05 128 256 +4.113732e+05 256 256 +4.031868e+05 512 256 +4.122711e+05 1024 256 ### GPU: scaling test 32 -3.147853e+04 1 32 -5.985873e+04 2 32 -1.086414e+05 4 32 -1.846072e+05 8 32 -2.795140e+05 16 32 -3.171308e+05 32 32 -3.664746e+05 64 32 -3.861934e+05 128 32 -3.935760e+05 256 32 -3.959241e+05 512 32 -3.999573e+05 1024 32 -4.014811e+05 2048 32 -4.043590e+05 4096 32 -4.145995e+05 8192 32 +3.385204e+04 1 32 +6.420133e+04 2 32 +1.152125e+05 4 32 +1.915590e+05 8 32 +2.905411e+05 16 32 +3.224594e+05 32 32 +3.664709e+05 64 32 +3.877008e+05 128 32 +3.943289e+05 256 32 +3.973742e+05 512 32 +3.959634e+05 1024 32 +4.111598e+05 2048 32 +4.057600e+05 4096 32 +4.143630e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.851734e+03 1 256 -1.852841e+03 2 256 -1.858966e+03 4 256 +1.922029e+03 1 256 +1.893998e+03 2 256 +1.843136e+03 4 256 ### CPU: scaling test 32 -1.839862e+03 1 32 -1.843418e+03 2 32 -1.855242e+03 4 32 +1.817453e+03 1 32 +1.838361e+03 2 32 +1.844916e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.376740e+03 1 256 -3.427003e+03 2 256 -3.418754e+03 4 256 +3.365985e+03 1 256 +3.369115e+03 2 256 +3.505332e+03 4 256 ### CPU: scaling test 32 -3.343494e+03 1 32 -3.346688e+03 2 32 -3.350028e+03 4 32 +3.354910e+03 1 32 +3.367130e+03 2 32 +3.433780e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.930406e+03 1 256 -7.927403e+03 2 256 -7.830665e+03 4 256 +7.986162e+03 1 256 +8.017301e+03 2 256 +8.166784e+03 4 256 ### CPU: scaling test 32 -7.705971e+03 1 32 -7.749828e+03 2 32 -7.499380e+03 4 32 +7.716102e+03 1 32 +7.842334e+03 2 32 +7.833804e+03 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.438432e+03 1 256 -8.876320e+03 2 256 -8.867251e+03 4 256 +9.045751e+03 1 256 +9.008977e+03 2 256 +9.201525e+03 4 256 ### CPU: scaling test 32 -8.678830e+03 1 32 -8.575889e+03 2 32 -8.706424e+03 4 32 +8.625339e+03 1 32 +8.761732e+03 2 32 +8.642882e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.649041e+03 1 256 -6.668160e+03 2 256 -6.667655e+03 4 256 +6.930824e+03 1 256 +7.096463e+03 2 256 +7.178522e+03 4 256 ### CPU: scaling test 32 -6.543129e+03 1 32 -6.626562e+03 2 32 -6.609869e+03 4 32 +6.589302e+03 1 32 +6.801088e+03 2 32 +6.751847e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt index ef3556442f..ca32dec5d3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:52:22 +DATE: 2025-12-07_18:09:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.934631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.932642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.966781e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.969386e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.207545 sec - 4,504,483,186 cycles # 2.857 GHz - 6,247,204,557 instructions # 1.39 insn per cycle - 1.634328522 seconds time elapsed +TOTAL : 1.159745 sec + 4,478,218,470 cycles # 2.927 GHz + 6,211,050,064 instructions # 1.39 insn per cycle + 1.591221272 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.840362e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.841255e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.841255e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.912488e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.913449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913449e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.917657 sec - 25,674,151,776 cycles # 2.878 GHz - 78,572,254,617 instructions # 3.06 insn per cycle - 8.921718104 seconds time elapsed +TOTAL : 8.581733 sec + 25,629,586,992 cycles # 2.986 GHz + 78,331,581,474 instructions # 3.06 insn per cycle + 8.585694420 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.319765e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.322676e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.322676e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.502469e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.505599e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505599e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.946260 sec - 13,085,012,778 cycles # 2.644 GHz - 39,592,390,137 instructions # 3.03 insn per cycle - 4.950371272 seconds time elapsed +TOTAL : 4.688943 sec + 13,086,683,830 
cycles # 2.789 GHz + 39,590,063,635 instructions # 3.03 insn per cycle + 4.692884045 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.807824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.823601e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.823601e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.079723e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.095951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.095951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.106755 sec - 5,651,241,480 cycles # 2.678 GHz - 13,863,632,897 instructions # 2.45 insn per cycle - 2.110867653 seconds time elapsed +TOTAL : 2.036371 sec + 5,639,068,289 cycles # 2.765 GHz + 13,860,236,649 instructions # 2.46 insn per cycle + 2.040498172 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.771177e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.791107e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.791107e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.025617e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.046735e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.046735e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.876075 sec - 5,022,531,784 cycles # 2.673 GHz - 12,559,680,227 instructions # 2.50 insn per cycle - 1.880203925 seconds time elapsed +TOTAL : 1.823135 sec + 4,997,929,727 cycles # 2.737 GHz + 12,556,417,181 instructions # 2.51 insn per cycle + 1.827032129 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.686685e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.698350e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.698350e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.078204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090846e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.090846e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.459028 sec - 4,208,203,803 
cycles # 1.709 GHz - 6,429,086,120 instructions # 1.53 insn per cycle - 2.463275806 seconds time elapsed +TOTAL : 2.323628 sec + 4,198,741,229 cycles # 1.805 GHz + 6,424,131,886 instructions # 1.53 insn per cycle + 2.327588097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index afbbcacb7a..1f942fa68c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:31:19 +DATE: 2025-12-07_18:51:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.849435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.385880e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.385880e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.851923e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.382378e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.382378e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.489334 sec - 2,114,311,442 cycles # 2.842 GHz - 3,127,238,641 instructions # 1.48 insn per cycle - 0.800689166 seconds time elapsed +TOTAL : 0.487600 sec + 2,132,500,387 cycles # 2.888 GHz + 3,143,076,569 instructions # 1.47 insn per cycle + 0.797041092 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.851000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.851887e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.851887e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893605e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894534e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894534e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.871032 sec - 25,693,998,933 cycles # 2.896 GHz - 78,573,360,631 instructions # 3.06 insn per cycle - 8.875307913 seconds time elapsed +TOTAL : 8.671311 sec + 25,664,851,871 cycles # 2.959 GHz + 78,342,649,953 instructions # 3.05 insn per cycle + 8.675577382 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388018e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391044e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391044e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.432088e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.435152e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.435152e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.851540 sec - 13,088,956,582 cycles # 2.696 GHz - 39,603,859,010 instructions # 3.03 insn per cycle - 4.856264549 seconds time elapsed +TOTAL : 4.788968 sec + 13,092,263,661 cycles # 2.732 GHz + 39,603,794,843 instructions # 3.02 insn per cycle + 4.793258894 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.795496e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.810972e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.810972e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.074590e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.091404e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.091404e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.115018 sec - 5,684,762,872 cycles # 2.683 GHz - 13,871,040,440 instructions # 2.44 insn per cycle - 2.119380961 seconds time elapsed +TOTAL : 2.041504 sec + 5,656,838,282 cycles # 2.766 GHz + 13,870,885,068 instructions # 2.45 insn per cycle + 2.045674323 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.855184e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876301e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.876301e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.117977e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.140163e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.140163e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.862992 sec - 5,028,827,648 cycles # 2.694 GHz - 12,567,491,832 instructions # 2.50 insn per cycle - 1.867563931 seconds time elapsed +TOTAL : 1.809332 sec + 5,030,593,601 cycles # 2.775 GHz + 12,567,641,114 instructions # 2.50 insn per cycle + 1.813603731 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.712981e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.724915e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.724915e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.003610e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016863e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.016863e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.454832 sec - 4,213,905,835 cycles # 1.714 GHz - 6,436,340,551 instructions # 1.53 insn per cycle - 2.459274611 seconds time elapsed +TOTAL : 2.351960 sec + 4,213,153,460 cycles # 1.789 GHz + 6,436,189,653 instructions # 1.53 insn per cycle + 2.356211822 seconds time elapsed 
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index d4d5e2b45e..9d218cdae5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:44:57 +DATE: 2025-12-07_19:04:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.369462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.419383e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.422637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.373077e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.418282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.421254e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.487281 sec - 2,090,605,611 cycles # 2.842 GHz - 3,063,541,899 instructions # 1.47 insn per cycle - 0.797172689 seconds time elapsed +TOTAL : 0.479562 sec + 2,134,672,771 cycles # 2.914 GHz + 3,164,286,470 instructions # 1.48 insn per cycle + 0.789473188 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850241e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915241e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916182e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916182e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.876225 sec - 25,662,776,506 cycles # 2.890 GHz - 78,567,147,731 instructions # 3.06 insn per cycle - 8.880187224 seconds time elapsed +TOTAL : 8.570683 sec + 25,646,300,668 cycles # 2.992 GHz + 78,330,929,447 instructions # 3.05 insn per cycle + 8.574588810 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.358067e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.361108e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361108e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.507245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.510376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.510376e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.892312 sec - 13,068,286,128 cycles # 2.669 GHz - 39,590,526,259 instructions # 3.03 insn per cycle - 4.896571237 seconds time elapsed +TOTAL : 4.684296 sec + 
13,092,688,606 cycles # 2.794 GHz + 39,591,058,544 instructions # 3.02 insn per cycle + 4.688099844 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.827564e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.843333e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.843333e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.136934e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.153569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.153569e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.103410 sec - 5,668,034,580 cycles # 2.691 GHz - 13,860,472,796 instructions # 2.45 insn per cycle - 2.107462678 seconds time elapsed +TOTAL : 2.023538 sec + 5,649,498,477 cycles # 2.788 GHz + 13,859,217,639 instructions # 2.45 insn per cycle + 2.027441318 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.833416e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.853413e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.853413e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.190175e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.211590e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.211590e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.864637 sec - 5,021,320,374 cycles # 2.689 GHz - 12,554,612,891 instructions # 2.50 insn per cycle - 1.868702414 seconds time elapsed +TOTAL : 1.792137 sec + 5,015,775,118 cycles # 2.794 GHz + 12,554,408,840 instructions # 2.50 insn per cycle + 1.796010274 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.674295e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.686265e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.686265e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.852920e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.865596e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.865596e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.465332 sec - 
4,203,800,820 cycles # 1.703 GHz - 6,422,604,226 instructions # 1.53 insn per cycle - 2.469400350 seconds time elapsed +TOTAL : 2.401145 sec + 4,203,610,574 cycles # 1.749 GHz + 6,422,770,487 instructions # 1.53 insn per cycle + 2.405188253 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 2beaf322b6..c596850cb1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:41:27 +DATE: 2025-12-07_19:00:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.390277e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.431631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.434858e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.372468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.417754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.420802e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.485227 sec - 2,088,179,344 cycles # 2.833 GHz - 3,069,782,317 instructions # 1.47 insn per cycle - 0.797220882 seconds time elapsed +TOTAL : 0.482378 sec + 2,108,737,491 cycles # 2.889 GHz + 3,140,622,430 instructions # 1.49 insn per cycle + 0.791837229 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.841686e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.842564e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.842564e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.916600e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.917571e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917571e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.911703 sec - 25,672,385,298 cycles # 2.880 GHz - 78,567,422,772 instructions # 3.06 insn per cycle - 8.915910048 seconds time elapsed +TOTAL : 8.563316 sec + 25,637,689,775 cycles # 2.993 GHz + 78,330,571,977 instructions # 3.06 insn per cycle + 8.567238721 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.377610e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380670e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.380670e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460688e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.463720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.463720e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.861995 sec - 13,083,483,284 cycles # 2.689 GHz - 39,590,790,279 instructions # 3.03 insn per cycle - 4.866021467 seconds time elapsed +TOTAL : 4.745180 sec + 
13,092,812,550 cycles # 2.758 GHz + 39,590,494,839 instructions # 3.02 insn per cycle + 4.749075654 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.782247e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.797307e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.797307e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.062631e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.079348e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.079348e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.113995 sec - 5,648,509,407 cycles # 2.668 GHz - 13,860,950,299 instructions # 2.45 insn per cycle - 2.118130954 seconds time elapsed +TOTAL : 2.040198 sec + 5,639,089,206 cycles # 2.760 GHz + 13,860,209,305 instructions # 2.46 insn per cycle + 2.044178615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.815640e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.835781e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.835781e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.195017e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.216588e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.216588e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.866689 sec - 5,013,333,127 cycles # 2.681 GHz - 12,556,528,301 instructions # 2.50 insn per cycle - 1.870730508 seconds time elapsed +TOTAL : 1.789593 sec + 5,012,074,785 cycles # 2.796 GHz + 12,556,782,321 instructions # 2.51 insn per cycle + 1.793552449 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.601628e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612890e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612890e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.004377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.017063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.017063e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.490563 sec - 
4,200,883,402 cycles # 1.685 GHz - 6,425,171,149 instructions # 1.53 insn per cycle - 2.494555434 seconds time elapsed +TOTAL : 2.347565 sec + 4,198,017,965 cycles # 1.786 GHz + 6,424,077,549 instructions # 1.53 insn per cycle + 2.351400509 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt index 2815ba1af8..c7835b00c3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:50:33 +DATE: 2025-12-07_19:16:47 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.400466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.444219e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.447053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.398542e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.443468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.446246e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.504359 sec - 2,085,179,396 cycles # 2.830 GHz - 3,096,904,235 instructions # 1.49 insn per cycle - 0.798389923 seconds time elapsed +TOTAL : 0.500465 sec + 2,116,975,105 cycles # 2.892 GHz + 3,166,341,420 instructions # 1.50 insn per cycle + 0.801283200 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.851668e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852556e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852556e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914967e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914967e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.863632 sec - 25,676,607,785 cycles # 2.896 GHz - 78,566,655,326 instructions # 3.06 insn per cycle - 8.867760313 seconds time elapsed +TOTAL : 8.575102 sec + 25,637,374,835 cycles # 2.989 GHz + 78,330,500,739 instructions # 3.06 insn per cycle + 8.579127719 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.364733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.367766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.367766e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.461982e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465097e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465097e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.880672 sec - 13,087,360,743 cycles # 2.680 GHz - 39,590,709,537 instructions # 3.03 insn per cycle - 4.884841575 seconds time elapsed +TOTAL : 4.743440 sec + 13,090,280,742 
cycles # 2.759 GHz + 39,590,905,664 instructions # 3.02 insn per cycle + 4.747349345 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.891642e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.907720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.907720e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.139094e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.155816e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.155816e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.084604 sec - 5,646,655,758 cycles # 2.704 GHz - 13,860,514,996 instructions # 2.45 insn per cycle - 2.088799789 seconds time elapsed +TOTAL : 2.020846 sec + 5,639,365,318 cycles # 2.786 GHz + 13,860,096,307 instructions # 2.46 insn per cycle + 2.024947566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.832886e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.853061e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.853061e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.173766e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.195240e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.195240e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.862981 sec - 5,001,186,272 cycles # 2.680 GHz - 12,556,644,714 instructions # 2.51 insn per cycle - 1.867187074 seconds time elapsed +TOTAL : 1.794082 sec + 5,007,375,926 cycles # 2.786 GHz + 12,556,473,965 instructions # 2.51 insn per cycle + 1.798111746 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.594055e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.605629e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.605629e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.079884e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.092898e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.092898e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.493451 sec - 4,195,828,592 
cycles # 1.681 GHz - 6,424,665,239 instructions # 1.53 insn per cycle - 2.497646028 seconds time elapsed +TOTAL : 2.322789 sec + 4,199,103,446 cycles # 1.805 GHz + 6,424,071,774 instructions # 1.53 insn per cycle + 2.326811979 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 0158323c78..57adc98f16 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:38:00 +DATE: 2025-12-07_18:57:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.928428e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.433382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.436767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.929644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.423442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.426485e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.486860 sec - 2,086,798,241 cycles # 2.826 GHz - 3,070,254,605 instructions # 1.47 insn per cycle - 0.797700561 seconds time elapsed +TOTAL : 0.482312 sec + 2,128,553,520 cycles # 2.910 GHz + 3,132,010,332 instructions # 1.47 insn per cycle + 0.791208990 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.846748e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.847641e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847641e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884820e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.885748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.885748e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.887132 sec - 25,658,141,408 cycles # 2.886 GHz - 78,568,113,694 instructions # 3.06 insn per cycle - 8.891273835 seconds time elapsed +TOTAL : 8.707602 sec + 25,666,300,361 cycles # 2.947 GHz + 78,346,698,554 instructions # 3.05 insn per cycle + 8.711512026 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.370014e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.373021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.373021e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.462452e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465596e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465596e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.872933 sec - 13,079,305,653 cycles # 2.683 GHz - 39,591,036,555 instructions # 3.03 insn per cycle - 4.877066552 seconds time elapsed +TOTAL : 4.742880 sec + 13,074,173,272 cycles # 2.755 GHz + 39,590,591,508 instructions # 3.03 insn per cycle + 4.746733657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.876108e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.892295e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.892295e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.031626e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.047544e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.047544e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.088702 sec - 5,640,399,522 cycles # 2.696 GHz - 13,860,298,624 instructions # 2.46 insn per cycle - 2.092763612 seconds time elapsed +TOTAL : 2.047843 sec + 5,650,062,131 cycles # 2.755 GHz + 13,864,104,536 instructions # 2.45 insn per cycle + 2.051716280 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.890465e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.910782e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.910782e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.072627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.092449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.092449e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.851027 sec - 4,999,453,261 cycles # 2.696 GHz - 12,556,321,373 instructions # 2.51 insn per cycle - 1.855011471 seconds time elapsed +TOTAL : 1.813551 sec + 5,008,288,159 cycles # 2.757 GHz + 12,556,899,010 instructions # 2.51 insn per cycle + 1.817499657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.623877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.635346e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.635346e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.976867e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989515e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.989515e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.482437 sec - 4,198,161,225 cycles # 1.689 GHz - 6,424,537,434 instructions # 1.53 insn per cycle - 2.486588561 seconds time elapsed +TOTAL : 2.356742 sec + 4,197,335,129 cycles # 1.779 GHz + 6,424,824,726 
instructions # 1.53 insn per cycle + 2.360833423 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index f41a7b9938..b1a92d0391 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:24:03 +DATE: 2025-12-07_17:41:26 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.429377e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.477740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.480923e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.439036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.482478e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.500889 sec - 2,161,311,557 cycles # 2.855 GHz - 3,140,076,215 instructions # 1.45 insn per cycle - 0.823418290 seconds time elapsed +TOTAL : 0.496222 sec + 2,168,408,288 cycles # 2.897 GHz + 3,176,074,422 instructions # 1.46 insn per cycle + 0.809492711 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.849400e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850323e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850323e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849656e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850540e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850540e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.874198 sec - 25,611,778,767 cycles # 2.885 GHz - 78,652,591,485 instructions # 3.07 insn per cycle - 8.878147244 seconds time elapsed +TOTAL : 8.872482 sec + 25,630,718,684 cycles # 2.888 GHz + 78,416,592,128 instructions # 3.06 insn per cycle + 8.876234715 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.379484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382464e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382464e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.435042e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.438113e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.438113e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.859162 sec - 13,089,109,626 cycles # 2.692 GHz - 39,515,404,087 instructions # 3.02 insn per cycle - 4.863216879 seconds time elapsed +TOTAL : 4.780370 sec + 13,080,838,220 
cycles # 2.736 GHz + 39,513,664,897 instructions # 3.02 insn per cycle + 4.784378697 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.837369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.853285e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.853285e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.061160e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.077007e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.077007e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.098643 sec - 5,677,190,930 cycles # 2.701 GHz - 13,961,575,914 instructions # 2.46 insn per cycle - 2.102810449 seconds time elapsed +TOTAL : 2.040381 sec + 5,678,126,139 cycles # 2.779 GHz + 13,961,262,784 instructions # 2.46 insn per cycle + 2.044311663 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.705091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.724821e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.724821e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.185099e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.206698e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.206698e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.889961 sec - 5,055,738,073 cycles # 2.670 GHz - 12,659,664,704 instructions # 2.50 insn per cycle - 1.894052230 seconds time elapsed +TOTAL : 1.791711 sec + 5,010,384,238 cycles # 2.792 GHz + 12,659,457,552 instructions # 2.53 insn per cycle + 1.795580160 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10483) (512y: 226) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.677757e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.689492e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.689492e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.118875e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.131406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.131406e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.462163 sec - 4,206,188,103 
cycles # 1.706 GHz - 6,542,388,485 instructions # 1.56 insn per cycle - 2.466313710 seconds time elapsed +TOTAL : 2.309620 sec + 4,202,683,631 cycles # 1.817 GHz + 6,542,225,167 instructions # 1.56 insn per cycle + 2.313838474 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1764) (512y: 185) (512z: 9379) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index b05fc67f3a..70489ffe6b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:20:09 +DATE: 2025-12-07_18:40:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.097347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.048609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.087120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089792e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.501512 sec - 2,120,097,032 cycles # 2.815 GHz - 3,067,817,522 instructions # 1.45 insn per cycle - 0.823770320 seconds time elapsed +TOTAL : 0.500406 sec + 2,178,671,946 cycles # 2.896 GHz + 3,154,423,709 instructions # 1.45 insn per cycle + 0.821781396 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.202543e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.203008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.203008e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.313935e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.314418e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.314418e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.031219 sec - 112,588,276,317 cycles # 2.885 GHz - 142,621,877,493 instructions # 1.27 insn per cycle - 39.035229334 seconds time elapsed +TOTAL : 38.023167 sec + 112,573,177,609 cycles # 2.961 GHz + 142,384,276,945 instructions # 1.26 insn per cycle + 38.027324535 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.909352e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.911559e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.911559e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.050393e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.052807e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.052807e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.643908 sec - 15,024,056,162 cycles # 2.661 GHz - 37,385,323,408 instructions # 2.49 insn per cycle - 5.648271623 seconds time elapsed +TOTAL : 5.382873 sec + 
15,022,516,544 cycles # 2.789 GHz + 37,385,389,525 instructions # 2.49 insn per cycle + 5.386887739 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:67523) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.457222e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.471736e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471736e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.726848e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.741838e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.741838e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.205981 sec - 5,946,476,110 cycles # 2.692 GHz - 12,809,216,170 instructions # 2.15 insn per cycle - 2.210041352 seconds time elapsed +TOTAL : 2.129140 sec + 5,941,046,141 cycles # 2.786 GHz + 12,807,904,314 instructions # 2.16 insn per cycle + 2.133233300 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45792) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.156302e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.178569e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.178569e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.510024e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.534081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.534081e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.797567 sec - 4,817,758,417 cycles # 2.675 GHz - 11,422,908,794 instructions # 2.37 insn per cycle - 1.801731550 seconds time elapsed +TOTAL : 1.730951 sec + 4,816,756,538 cycles # 2.777 GHz + 11,423,499,857 instructions # 2.37 insn per cycle + 1.735056687 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40102) (512y: 282) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.936851e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.949204e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.949204e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.412243e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.426456e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.426456e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.370929 sec 
- 4,028,743,609 cycles # 1.697 GHz - 5,966,081,307 instructions # 1.48 insn per cycle - 2.375198937 seconds time elapsed +TOTAL : 2.219032 sec + 4,018,872,014 cycles # 1.809 GHz + 5,965,269,424 instructions # 1.48 insn per cycle + 2.223063786 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 337) (512z:39235) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 10c6792da9..b6fc969e5a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:21:27 +DATE: 2025-12-07_18:41:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.079972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.121448e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.084961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.128490e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505348 sec - 2,147,536,542 cycles # 2.834 GHz - 3,073,502,942 instructions # 1.43 insn per cycle - 0.816880103 seconds time elapsed +TOTAL : 0.497812 sec + 2,182,220,696 cycles # 2.916 GHz + 3,170,921,163 instructions # 1.45 insn per cycle + 0.808753069 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.177605e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178066e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178066e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.289224e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289695e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289695e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.263371 sec - 113,104,353,359 cycles # 2.881 GHz - 142,499,000,297 instructions # 1.26 insn per cycle - 39.267518963 seconds time elapsed +TOTAL : 38.241650 sec + 113,019,199,231 cycles # 2.955 GHz + 142,248,565,323 instructions # 1.26 insn per cycle + 38.245708087 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.978578e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.980900e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.980900e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072453e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.074887e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.074887e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.512347 sec - 14,738,984,303 cycles # 2.672 GHz - 37,383,415,891 instructions # 2.54 insn per cycle - 5.516366576 seconds time elapsed +TOTAL : 5.344087 sec + 
14,707,947,905 cycles # 2.751 GHz + 37,382,488,315 instructions # 2.54 insn per cycle + 5.348098543 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:67498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.475575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.489872e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.489872e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.794112e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.809422e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.809422e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.200089 sec - 5,900,324,656 cycles # 2.678 GHz - 12,761,113,056 instructions # 2.16 insn per cycle - 2.204163616 seconds time elapsed +TOTAL : 2.110550 sec + 5,896,212,315 cycles # 2.790 GHz + 12,761,026,315 instructions # 2.16 insn per cycle + 2.114576128 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.197126e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219484e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219484e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.520990e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.543290e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.543290e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.789159 sec - 4,800,966,323 cycles # 2.679 GHz - 11,387,516,470 instructions # 2.37 insn per cycle - 1.793280010 seconds time elapsed +TOTAL : 1.728655 sec + 4,798,605,535 cycles # 2.771 GHz + 11,387,169,921 instructions # 2.37 insn per cycle + 1.732575457 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39634) (512y: 220) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.918624e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.931258e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.931258e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.231277e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.245077e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.245077e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.376650 sec 
- 4,022,990,522 cycles # 1.691 GHz - 5,935,742,762 instructions # 1.48 insn per cycle - 2.380804465 seconds time elapsed +TOTAL : 2.274099 sec + 4,029,131,713 cycles # 1.770 GHz + 5,935,536,536 instructions # 1.47 insn per cycle + 2.278245892 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1962) (512y: 259) (512z:38890) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling index 66df8ea815..3016eb84cc 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:43:39 +DATE: 2025-12-07_18:00:28 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -4.135255e+05 1 256 -5.793061e+05 2 256 -6.367973e+05 4 256 -7.358963e+05 8 256 -7.953962e+05 16 256 -8.026621e+05 32 256 -8.113874e+05 64 256 -8.126232e+05 128 256 -8.151724e+05 256 256 -8.388200e+05 512 256 -8.795025e+05 1024 256 +4.268538e+05 1 256 +5.760745e+05 2 256 +6.378282e+05 4 256 +7.363945e+05 8 256 +7.930300e+05 16 256 +8.092848e+05 32 256 +8.083531e+05 64 256 +8.078476e+05 128 256 +8.128909e+05 256 256 +8.357479e+05 512 256 +8.596279e+05 1024 256 ### GPU: scaling test 32 -5.987397e+04 1 32 -1.082531e+05 2 32 -2.101123e+05 4 32 -2.737883e+05 8 32 -5.126747e+05 16 32 -6.967787e+05 32 32 -7.376223e+05 64 32 -7.871564e+05 128 32 -8.121480e+05 256 32 -8.130411e+05 512 32 -8.134619e+05 1024 32 -8.204307e+05 2048 32 -8.423180e+05 4096 32 -8.883516e+05 8192 32 +5.876822e+04 1 32 +1.177650e+05 2 32 +2.137526e+05 4 32 +2.745258e+05 8 32 +5.237492e+05 16 32 +6.955671e+05 32 32 +7.415687e+05 64 32 +7.861791e+05 128 32 +7.981625e+05 256 32 +8.453694e+05 512 32 +8.147469e+05 1024 32 +8.171314e+05 2048 32 +8.389574e+05 4096 32 +8.965925e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.920624e+03 1 256 -1.925794e+03 2 256 -1.919663e+03 4 256 +1.958031e+03 1 256 +1.928720e+03 2 256 +1.948107e+03 4 256 ### CPU: scaling test 32 -1.889651e+03 1 32 -1.920077e+03 2 32 -1.912129e+03 4 32 +1.888667e+03 1 32 +1.910001e+03 2 32 +1.907509e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.748798e+03 1 256 -6.810960e+03 2 256 -6.802786e+03 4 256 +6.912774e+03 1 256 +6.959142e+03 2 256 +6.989530e+03 4 256 ### CPU: scaling test 32 -6.554707e+03 1 32 -6.688739e+03 2 32 -6.725225e+03 4 32 +6.628762e+03 1 32 +6.744899e+03 2 32 +6.808536e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.524095e+04 1 256 -1.526644e+04 2 256 -1.569761e+04 4 256 +1.564752e+04 1 256 +1.611574e+04 2 256 +1.593920e+04 4 256 ### CPU: scaling test 32 -1.566123e+04 1 32 -1.560506e+04 2 32 -1.523576e+04 4 32 +1.469281e+04 1 32 +1.518064e+04 2 32 +1.552514e+04 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.747918e+04 1 256 -1.758742e+04 2 256 -1.773825e+04 4 256 +1.731519e+04 1 256 +1.791835e+04 2 256 +1.808479e+04 4 256 ### CPU: scaling test 32 -1.691546e+04 1 32 -1.701187e+04 2 32 -1.740175e+04 4 32 +1.728150e+04 1 32 +1.733256e+04 2 32 +1.744966e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.350824e+04 1 256 -1.356994e+04 2 256 -1.370361e+04 4 256 +1.360046e+04 1 256 +1.367494e+04 2 256 +1.396483e+04 4 256 ### CPU: scaling test 32 -1.321355e+04 1 32 -1.322154e+04 2 32 -1.321729e+04 4 32 +1.337072e+04 1 32 +1.329356e+04 2 32 +1.330896e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index edf11bdd4c..950de8f0b6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:26:12 +DATE: 2025-12-07_17:43:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.969754e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.061645e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.069860e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.009223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.090558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.097271e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480574 sec - 2,060,773,811 cycles # 2.817 GHz - 2,941,122,949 instructions # 1.43 insn per cycle - 0.791153613 seconds time elapsed +TOTAL : 0.474558 sec + 2,094,520,730 cycles # 2.910 GHz + 2,979,894,156 instructions # 1.42 insn per cycle + 0.776602911 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.903278e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904203e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.956603e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957551e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957551e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.622014 sec - 25,008,733,138 cycles # 2.900 GHz - 79,110,262,561 instructions # 3.16 insn per cycle - 8.625952005 seconds time elapsed +TOTAL : 8.387125 sec + 24,949,943,683 cycles # 2.974 GHz + 79,061,639,892 instructions # 3.17 insn per cycle + 8.390886546 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.866781e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.879439e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.879439e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.025042e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.037745e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.037745e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.393369 sec - 6,521,051,461 cycles # 2.721 GHz - 20,285,887,455 instructions # 3.11 insn per cycle - 2.397558323 seconds time elapsed +TOTAL : 2.339827 sec + 6,526,044,399 cycles # 2.785 GHz + 20,285,711,859 instructions # 3.11 insn per cycle + 2.343708680 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.574802e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.581515e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.581515e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.615843e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.622467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622467e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.046468 sec - 2,851,964,901 cycles # 2.717 GHz - 7,084,391,235 instructions # 2.48 insn per cycle - 1.050530428 seconds time elapsed +TOTAL : 1.019923 sec + 2,852,038,825 cycles # 2.787 GHz + 7,084,520,838 instructions # 2.48 insn per cycle + 1.023798002 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.745784e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.753552e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.753552e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.822333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.830880e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.830880e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.944326 sec - 2,540,352,407 cycles # 2.681 GHz - 6,429,340,698 instructions # 2.53 insn per cycle - 0.948183906 seconds time elapsed +TOTAL : 0.904614 sec + 2,537,553,580 cycles # 2.796 GHz + 6,429,400,088 instructions # 2.53 insn per cycle + 0.908453117 seconds time elapsed 
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.337094e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341815e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.428623e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.433939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433939e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.231615 sec - 2,100,593,891 cycles # 1.701 GHz - 3,321,026,364 instructions # 1.58 insn per cycle - 1.235667181 seconds time elapsed +TOTAL : 1.153112 sec + 2,099,906,177 cycles # 1.816 GHz + 3,320,710,403 instructions # 1.58 insn per cycle + 1.157174256 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling index ef0c8bca55..fe61caa9dd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:00:32 +DATE: 2025-12-07_18:17:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.335389e+05 1 256 -3.586592e+05 2 256 -4.818891e+05 4 256 -5.593817e+05 8 256 -6.056925e+05 16 256 -6.276955e+05 32 256 -6.367619e+05 64 256 -6.473110e+05 128 256 -6.476010e+05 256 256 -6.505009e+05 512 256 -6.687069e+05 1024 256 +2.296302e+05 1 256 +3.628391e+05 2 256 +4.707869e+05 4 256 +5.638458e+05 8 256 +6.079456e+05 16 256 +6.308704e+05 32 256 +6.399076e+05 64 256 +6.474644e+05 128 256 +6.679200e+05 256 256 +6.795328e+05 512 256 +6.730411e+05 1024 256 ### GPU: scaling test 32 -3.216908e+04 1 32 -6.168033e+04 2 32 -1.180476e+05 4 32 -1.918642e+05 8 32 -3.068465e+05 16 32 -4.811781e+05 32 32 -5.662467e+05 64 32 -6.060356e+05 128 32 -6.424836e+05 256 32 -6.336577e+05 512 32 -6.477611e+05 1024 32 -6.516195e+05 2048 32 -6.509793e+05 4096 32 -6.718523e+05 8192 32 +3.475552e+04 1 32 +6.398752e+04 2 32 +1.214578e+05 4 32 +2.023130e+05 8 32 +3.191736e+05 16 32 +5.036500e+05 32 32 +5.726126e+05 64 32 +6.080335e+05 128 32 +6.363234e+05 256 32 +6.393349e+05 512 32 +6.476995e+05 1024 32 +6.553857e+05 2048 32 +6.505865e+05 4096 32 +6.689217e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.906133e+03 1 256 -1.895289e+03 2 256 -1.894897e+03 4 256 +1.950543e+03 1 256 +1.949880e+03 2 256 +1.979369e+03 4 256 ### CPU: scaling test 32 -1.889460e+03 1 32 -1.885630e+03 2 32 -1.887908e+03 4 32 +1.958185e+03 1 32 +1.933485e+03 2 32 +1.934882e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.645424e+03 1 256 -6.741425e+03 2 256 -6.801857e+03 4 256 +6.945753e+03 1 256 +6.911759e+03 2 256 +7.049191e+03 4 256 ### CPU: scaling test 32 -6.523685e+03 1 32 -6.609563e+03 2 32 -6.739293e+03 4 32 +6.751196e+03 1 32 +6.744984e+03 2 32 +6.836889e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 
-1.544354e+04 1 256 -1.568938e+04 2 256 -1.565635e+04 4 256 +1.572513e+04 1 256 +1.617838e+04 2 256 +1.592834e+04 4 256 ### CPU: scaling test 32 -1.473739e+04 1 32 -1.556619e+04 2 32 -1.562139e+04 4 32 +1.556360e+04 1 32 +1.557009e+04 2 32 +1.568554e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.746432e+04 1 256 -1.767402e+04 2 256 -1.746961e+04 4 256 +1.757553e+04 1 256 +1.809462e+04 2 256 +1.797139e+04 4 256 ### CPU: scaling test 32 -1.748124e+04 1 32 -1.594924e+04 2 32 -1.708084e+04 4 32 +1.754151e+04 1 32 +1.750266e+04 2 32 +1.734762e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.329941e+04 1 256 -1.349011e+04 2 256 -1.344081e+04 4 256 +1.362580e+04 1 256 +1.403999e+04 2 256 +1.431963e+04 4 256 ### CPU: scaling test 32 -1.333268e+04 1 32 -1.314999e+04 2 32 -1.325747e+04 4 32 +1.329217e+04 1 32 +1.335007e+04 2 32 +1.326821e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt index 701efdbc30..ec91d400db 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:54:02 +DATE: 2025-12-07_18:10:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.311490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.371404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.377432e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.317612e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.365225e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.370160e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 1.171779 sec - 4,342,560,419 cycles # 2.834 GHz - 5,966,664,550 instructions # 1.37 insn per cycle - 1.591397840 seconds time elapsed +TOTAL : 1.145636 sec + 4,389,673,693 cycles # 2.922 GHz + 6,055,248,416 instructions # 1.38 insn per cycle + 1.559352549 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.892352e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893287e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893287e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.953257e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954233e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954233e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.671691 sec - 25,006,063,904 cycles # 2.883 GHz - 79,110,972,034 instructions # 3.16 insn per cycle - 8.675650420 seconds time elapsed +TOTAL : 8.401288 sec + 24,948,495,360 cycles # 2.969 GHz + 79,060,108,165 instructions # 3.17 insn per cycle + 8.405058063 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 
+Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.783736e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.796482e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.796482e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.003942e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016793e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.016793e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.422556 sec - 6,525,728,187 cycles # 2.691 GHz - 20,285,987,046 instructions # 3.11 insn per cycle - 2.426471276 seconds time elapsed +TOTAL : 2.346673 sec + 6,530,961,638 cycles # 2.779 GHz + 20,285,877,175 instructions # 3.11 insn per cycle + 2.350591784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.560871e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.567340e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.567340e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.612325e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619053e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619053e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.055589 sec - 2,850,961,292 cycles # 2.692 GHz - 7,084,449,005 instructions # 2.48 insn per cycle - 1.059632714 seconds time elapsed +TOTAL : 1.021813 sec + 2,852,927,964 cycles # 2.783 GHz + 7,084,480,726 instructions # 2.48 insn per cycle + 1.025658888 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.733304e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810907e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.819697e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819697e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.951122 sec - 2,540,771,004 cycles # 2.663 GHz - 6,429,427,589 instructions # 
2.53 insn per cycle - 0.954962814 seconds time elapsed +TOTAL : 0.910575 sec + 2,540,201,223 cycles # 2.780 GHz + 6,429,194,265 instructions # 2.53 insn per cycle + 0.914520942 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.328792e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.333460e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.333460e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.418598e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423673e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423673e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.239447 sec - 2,103,191,835 cycles # 1.693 GHz - 3,321,146,945 instructions # 1.58 insn per cycle - 1.243442238 seconds time elapsed +TOTAL : 1.161360 sec + 2,100,960,625 cycles # 1.805 GHz + 3,321,109,554 instructions # 1.58 insn per cycle + 1.165157666 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 33e9172b7c..c7527edf76 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:32:02 +DATE: 2025-12-07_18:51:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.861766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.949922e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.904866e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.978714e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.978714e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.468518 sec - 2,012,803,026 cycles # 2.822 GHz - 2,875,965,208 instructions # 1.43 insn per cycle - 0.770453877 seconds time elapsed +TOTAL : 0.463886 sec + 2,059,059,125 cycles # 2.891 GHz + 2,952,134,921 instructions # 1.43 insn per cycle + 0.769684136 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.893203e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.894136e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.894136e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.963517e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964535e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964535e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.670365 sec - 25,029,663,251 cycles # 2.886 GHz - 79,116,596,499 instructions # 3.16 insn per cycle - 8.674407204 seconds time elapsed +TOTAL : 8.360066 sec + 24,958,211,075 cycles # 2.985 GHz + 79,064,944,041 instructions # 3.17 insn per cycle + 8.364166710 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -113,8 +107,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.709216e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.721522e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.721522e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.042034e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055478e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.055478e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.452506 sec - 6,536,185,486 cycles # 2.662 GHz - 20,295,453,995 instructions # 3.11 insn per cycle - 2.456555328 seconds time elapsed +TOTAL : 2.336612 sec + 6,531,204,956 cycles # 2.791 GHz + 20,294,718,787 instructions # 3.11 insn per cycle + 2.340639304 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = 
FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562296e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568810e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568810e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.602709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.609689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.609689e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.057576 sec - 2,861,881,138 cycles # 2.697 GHz - 7,094,482,774 instructions # 2.48 insn per cycle - 1.061902735 seconds time elapsed +TOTAL : 1.030847 sec + 2,888,576,131 cycles # 2.793 GHz + 7,094,202,147 instructions # 2.46 insn per cycle + 1.034789120 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.759096e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.767108e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.767108e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.816144e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.825127e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.825127e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.940293 sec - 2,550,431,948 cycles # 2.703 GHz - 6,439,393,273 instructions # 2.52 insn per cycle - 0.944425361 seconds time elapsed +TOTAL : 0.910484 sec + 2,548,833,079 cycles # 2.791 GHz + 6,438,935,487 instructions # 2.53 insn per cycle + 0.914502603 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.351978e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.356813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356813e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.421653e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426848e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426848e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.220874 sec - 2,108,458,958 cycles # 1.722 GHz - 3,331,332,180 instructions # 1.58 insn per cycle - 1.225108686 seconds time elapsed +TOTAL : 1.161248 sec + 2,108,222,232 cycles # 1.810 GHz + 3,331,003,797 instructions # 1.58 insn per cycle + 1.165159975 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 2a484de798..b1351b62de 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:45:41 +DATE: 2025-12-07_19:04:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.975551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.068315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.076540e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.976367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.058837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.065637e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.467991 sec - 2,005,858,911 cycles # 2.818 GHz - 2,853,662,043 instructions # 1.42 insn per cycle - 0.770358119 seconds time elapsed +TOTAL : 0.460222 sec + 2,057,189,004 cycles # 2.913 GHz + 2,957,602,738 instructions # 1.44 insn per cycle + 0.763325608 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.892862e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893799e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893799e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.955425e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.956407e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.956407e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.670204 sec - 25,024,619,872 cycles # 2.885 GHz - 79,109,507,524 instructions # 3.16 insn per cycle - 8.674082417 seconds time elapsed +TOTAL : 8.393999 sec + 24,961,188,393 cycles # 2.973 GHz + 79,061,074,802 instructions # 3.17 insn per cycle + 8.397810361 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.794380e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806787e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806787e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.996390e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.009776e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.009776e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.419819 sec - 6,522,870,130 cycles # 2.692 GHz - 20,284,313,479 instructions # 3.11 insn per cycle - 2.423616462 seconds time elapsed +TOTAL : 2.349878 sec + 6,540,111,986 cycles # 2.780 GHz + 20,285,643,085 instructions # 3.10 insn per cycle + 2.353763174 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.559254e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565757e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.615994e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.622815e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622815e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.057643 sec - 2,858,106,356 cycles # 2.694 GHz - 7,082,027,901 instructions # 2.48 insn per cycle - 1.061594009 seconds time elapsed +TOTAL : 1.021592 sec + 2,858,719,310 cycles # 2.790 GHz + 7,083,724,751 instructions # 2.48 insn per cycle + 1.025199555 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.732036e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.739945e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.739945e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.787452e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795519e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795519e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.953431 sec - 2,543,753,776 cycles # 2.660 GHz - 6,427,635,361 instructions # 2.53 insn per cycle - 0.957126756 seconds time elapsed +TOTAL : 0.924137 sec + 2,546,676,753 cycles # 2.747 GHz + 6,427,755,011 instructions # 2.52 insn per cycle + 0.927975892 seconds time 
elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.349101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419180e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.424373e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424373e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.221899 sec - 2,101,668,726 cycles # 1.716 GHz - 3,317,393,025 instructions # 1.58 insn per cycle - 1.225868499 seconds time elapsed +TOTAL : 1.161870 sec + 2,102,387,378 cycles # 1.805 GHz + 3,317,244,959 instructions # 1.58 insn per cycle + 1.165582474 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 9f5f8217b1..43fbfb3c5b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:42:10 +DATE: 2025-12-07_19:01:31 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.971986e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.070136e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.083717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.002574e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.089018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.096279e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.465911 sec - 2,085,649,672 cycles # 2.824 GHz - 2,853,158,366 instructions # 1.37 insn per cycle - 0.797926486 seconds time elapsed +TOTAL : 0.458854 sec + 2,055,903,812 cycles # 2.919 GHz + 2,921,151,008 instructions # 1.42 insn per cycle + 0.761487234 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.887385e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.888309e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.888309e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.947118e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948110e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948110e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.694438 sec - 25,009,094,589 cycles # 2.876 GHz - 79,110,682,076 instructions # 3.16 insn per cycle - 8.698358258 seconds time elapsed +TOTAL : 8.428118 sec + 24,962,194,000 cycles # 2.960 GHz + 79,059,983,860 instructions # 3.17 insn per cycle + 8.432965670 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 
6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.786091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.798676e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.798676e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.974320e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.986827e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.986827e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.421571 sec - 6,521,561,343 cycles # 2.690 GHz - 20,285,907,872 instructions # 3.11 insn per cycle - 2.425622228 seconds time elapsed +TOTAL : 2.356689 sec + 6,521,141,621 cycles # 2.764 GHz + 20,285,728,016 instructions # 3.11 insn per cycle + 2.360438859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.544765e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.551053e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.551053e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.614975e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.621532e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621532e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.066479 sec - 2,853,976,312 cycles # 2.668 GHz - 7,084,427,661 instructions # 2.48 insn per cycle - 1.070436318 seconds time elapsed +TOTAL : 1.020380 sec + 2,853,836,071 cycles # 2.789 GHz + 7,084,767,173 instructions # 2.48 insn per cycle + 1.024120922 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.733440e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741292e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741292e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814116e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822488e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822488e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.951193 sec - 2,545,293,522 cycles # 2.667 GHz 
- 6,429,326,530 instructions # 2.53 insn per cycle - 0.955037744 seconds time elapsed +TOTAL : 0.908595 sec + 2,538,310,032 cycles # 2.784 GHz + 6,429,288,914 instructions # 2.53 insn per cycle + 0.912407150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.345267e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.349883e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.349883e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.427892e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.433206e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433206e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.224208 sec - 2,101,816,780 cycles # 1.713 GHz - 3,321,301,841 instructions # 1.58 insn per cycle - 1.228087953 seconds time elapsed +TOTAL : 1.153603 sec + 2,098,238,709 cycles # 1.814 GHz + 3,321,018,195 instructions # 1.58 insn per cycle + 1.157445110 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt index 30c823393b..1eea7f1e7c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:51:59 +DATE: 2025-12-07_19:18:10 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.013258e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.110808e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.949834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.036977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.043975e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.479902 sec - 1,978,219,521 cycles # 2.831 GHz - 2,863,905,705 instructions # 1.45 insn per cycle - 0.755864012 seconds time elapsed +TOTAL : 0.481573 sec + 2,021,196,737 cycles # 2.888 GHz + 2,929,813,756 instructions # 1.45 insn per cycle + 0.757555929 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.898659e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.899570e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899570e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.966499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967467e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967467e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.643023 sec - 24,998,550,241 cycles # 2.892 GHz - 79,111,084,095 instructions # 3.16 insn per cycle - 8.646984489 seconds time elapsed +TOTAL : 8.345159 sec + 24,941,976,672 cycles # 2.988 GHz + 79,059,997,478 instructions # 3.17 insn per cycle + 8.349011948 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.719385e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.731327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.731327e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.950815e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.963179e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.963179e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.445830 sec - 6,526,769,240 cycles # 2.665 GHz - 20,286,103,115 instructions # 3.11 insn per cycle - 2.449754025 seconds time elapsed +TOTAL : 2.364240 sec + 6,547,202,252 cycles # 2.766 GHz + 20,285,604,493 instructions # 3.10 insn per cycle + 2.368103024 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 
+137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.565963e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.572237e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.572237e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.616615e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623249e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.052461 sec - 2,851,588,130 cycles # 2.701 GHz - 7,084,479,012 instructions # 2.48 insn per cycle - 1.056444800 seconds time elapsed +TOTAL : 1.019064 sec + 2,850,365,861 cycles # 2.788 GHz + 7,084,339,822 instructions # 2.49 insn per cycle + 1.022926464 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.756542e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.756542e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.795937e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.804533e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804533e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.942761 sec - 2,539,647,091 cycles # 2.684 GHz - 6,429,491,013 instructions # 2.53 insn per cycle - 0.946755867 seconds time elapsed +TOTAL : 0.917945 sec + 2,540,216,622 cycles # 2.758 GHz + 6,429,572,577 instructions # 2.53 insn per cycle + 0.921924415 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.348567e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.353355e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.353355e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.420204e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425511e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425511e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.221456 sec - 2,102,747,652 cycles # 1.717 GHz - 3,321,271,092 instructions # 1.58 insn per cycle - 1.225405100 seconds time elapsed +TOTAL : 1.159830 sec + 2,108,738,068 cycles # 1.813 GHz + 3,320,804,549 instructions # 1.57 insn per cycle + 1.163865380 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index b51802abeb..f4be6f611c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:38:43 +DATE: 2025-12-07_18:58:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.083410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.111715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.119810e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.058044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.076009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.083008e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467709 sec - 2,010,523,047 cycles # 2.824 GHz - 2,892,361,831 instructions # 1.44 insn per cycle - 0.770628946 seconds time elapsed +TOTAL : 0.462194 sec + 2,060,068,545 cycles # 2.871 GHz + 2,922,560,520 instructions # 1.42 insn per cycle + 0.774665990 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.889714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890621e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890621e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.953891e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954840e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954840e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.683941 sec - 25,012,693,300 cycles # 2.880 GHz - 79,111,053,402 instructions # 3.16 insn per cycle - 8.687777898 seconds time elapsed +TOTAL : 8.398616 sec + 24,984,841,700 cycles # 2.974 GHz + 79,060,726,611 instructions # 3.16 insn per cycle + 8.402481796 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -110,8 +104,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.774197e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.786532e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.786532e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.889035e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.901286e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.901286e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.425829 sec - 6,538,669,629 cycles # 2.692 GHz - 20,286,236,268 instructions # 3.10 insn per cycle - 2.429903422 seconds time elapsed +TOTAL : 2.385448 sec + 6,530,876,571 cycles # 2.735 GHz + 20,287,833,266 instructions # 3.11 insn per cycle + 2.389264437 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538774e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.544893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.544893e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596382e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.602892e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602892e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.071044 sec - 2,851,268,280 cycles # 2.654 GHz - 7,084,649,438 instructions # 2.48 insn per cycle - 1.074854505 seconds time elapsed +TOTAL : 1.032125 sec + 2,853,083,492 cycles # 2.756 GHz + 7,084,347,023 instructions # 2.48 insn per cycle + 1.035968952 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.734960e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.742729e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.742729e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.782956e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.791756e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791756e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.950344 sec - 2,540,286,423 cycles # 2.664 GHz - 6,429,424,927 instructions # 2.53 insn per cycle - 0.954335905 seconds time elapsed +TOTAL : 0.924402 sec + 2,543,989,712 cycles # 2.743 GHz + 6,429,246,779 instructions # 2.53 insn per cycle + 0.928156100 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.326881e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.331538e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331538e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.420718e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425821e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425821e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.241226 sec - 2,102,177,412 cycles # 1.689 GHz - 3,321,695,580 instructions # 1.58 insn per cycle - 1.245320786 seconds time elapsed +TOTAL : 1.159375 sec + 2,098,982,194 cycles # 1.806 GHz + 3,320,751,929 instructions # 1.58 insn per cycle + 1.163173616 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index a1ed0e1048..d6fe2c2774 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:26:49 +DATE: 2025-12-07_17:44:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.023167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.108760e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.028073e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.104148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.111543e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481972 sec - 2,053,644,686 cycles # 2.818 GHz - 2,906,367,138 instructions # 1.42 insn per cycle - 0.790666270 seconds time elapsed +TOTAL : 0.475943 sec + 2,079,983,769 cycles # 2.868 GHz + 2,938,112,986 instructions # 1.41 insn per cycle + 0.783091623 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.911966e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912904e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912904e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.967253e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968224e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968224e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.582602 sec - 24,849,332,204 cycles # 2.895 GHz - 78,811,199,944 instructions # 3.17 insn per cycle - 8.586531797 seconds time elapsed +TOTAL : 8.341629 sec + 24,845,085,371 cycles # 2.978 GHz + 78,761,784,554 instructions # 3.17 insn per cycle + 8.345391919 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2999) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863279149748E-004 -Relative difference = 4.947803358686673e-08 +Avg ME (F77/C++) = 6.6274863280412627E-004 +Relative difference = 4.949708875636104e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.802565e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.815087e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.815087e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.102757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.115662e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.115662e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.415633 sec - 6,482,490,857 cycles # 2.680 GHz - 20,247,828,097 instructions # 3.12 insn per cycle - 2.419608944 seconds time elapsed +TOTAL : 2.313817 sec + 6,469,159,192 cycles # 2.793 GHz + 20,247,347,776 instructions # 3.13 insn per cycle + 2.317660650 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493020e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499074e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499074e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.542827e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548921e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.103256 sec - 2,994,004,582 cycles # 2.706 GHz - 7,224,670,986 instructions # 2.41 insn per cycle - 1.107361000 seconds time elapsed +TOTAL : 1.067585 sec + 2,988,528,375 cycles # 2.792 GHz + 7,224,687,437 instructions # 2.42 insn per cycle + 1.071409199 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12455) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.703839e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.711671e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.711671e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.755115e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.762972e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.762972e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.967356 sec - 2,634,233,834 cycles # 2.714 GHz - 6,565,459,296 instructions # 2.49 insn per cycle - 0.971230309 seconds time elapsed +TOTAL : 0.939211 sec + 2,633,100,957 cycles # 2.795 GHz + 6,565,445,624 instructions # 2.49 insn per cycle + 0.942991829 seconds time elapsed 
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11486) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.318889e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323344e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323344e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.379331e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.384307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.384307e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.248532 sec - 2,165,605,341 cycles # 1.730 GHz - 3,476,565,175 instructions # 1.61 insn per cycle - 1.252574898 seconds time elapsed +TOTAL : 1.194088 sec + 2,163,918,592 cycles # 1.807 GHz + 3,476,395,121 instructions # 1.61 insn per cycle + 1.198135615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3018) (512y: 20) (512z: 9665) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index c3e94ba26d..dcdfe51c20 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:22:45 +DATE: 2025-12-07_18:43:04 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.980018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.060840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.068475e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.929027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.029798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.037161e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.483472 sec - 2,078,701,556 cycles # 2.836 GHz - 2,938,258,784 instructions # 1.41 insn per cycle - 0.794272127 seconds time elapsed +TOTAL : 0.479365 sec + 2,102,475,486 cycles # 2.889 GHz + 3,002,183,547 instructions # 1.43 insn per cycle + 0.787745488 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.536396e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.537181e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.537181e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.671148e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.671980e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.671980e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.627851 sec - 85,239,542,827 cycles # 2.877 GHz - 134,215,968,109 instructions # 1.57 insn per cycle - 29.631730646 seconds time elapsed +TOTAL : 28.923656 sec + 85,892,844,113 cycles # 2.970 GHz + 134,163,555,126 instructions # 1.56 insn per cycle + 28.927503566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:15099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349049735310E-004 -Relative difference = 
1.4338131648076968e-08 +Avg ME (F77/C++) = 6.6275349003305783E-004 +Relative difference = 1.5038686634053265e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.562878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.574411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.574411e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.808140e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.820064e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.820064e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.504142 sec - 6,771,535,920 cycles # 2.701 GHz - 19,207,882,725 instructions # 2.84 insn per cycle - 2.508192424 seconds time elapsed +TOTAL : 2.413802 sec + 6,764,112,136 cycles # 2.799 GHz + 19,207,459,286 instructions # 2.84 insn per cycle + 2.417779102 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:68781) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.450780e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.456226e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.456226e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.490916e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.496573e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496573e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.135519 sec - 3,073,910,834 cycles # 2.700 GHz - 6,671,130,394 instructions # 2.17 insn per cycle - 1.139479935 seconds time elapsed +TOTAL : 1.105158 sec + 3,089,498,326 cycles # 2.788 GHz + 6,671,148,747 instructions # 2.16 insn per cycle + 1.109092943 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47844) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.771981e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.780020e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.780020e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.848917e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.857821e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.857821e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.930511 sec - 2,525,041,206 cycles # 2.704 GHz - 
5,950,807,908 instructions # 2.36 insn per cycle - 0.934389144 seconds time elapsed +TOTAL : 0.891741 sec + 2,500,206,839 cycles # 2.794 GHz + 5,951,043,541 instructions # 2.38 insn per cycle + 0.895468834 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42169) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.326409e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.331048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.416787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.421840e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.421840e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.241611 sec - 2,116,308,082 cycles # 1.700 GHz - 3,522,579,874 instructions # 1.66 insn per cycle - 1.245792482 seconds time elapsed +TOTAL : 1.162656 sec + 2,098,372,809 cycles # 1.800 GHz + 3,522,306,560 instructions # 1.68 insn per cycle + 1.166507967 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5213) (512y: 3) (512z:44839) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 0bef615dd8..709872e8d7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:23:46 +DATE: 2025-12-07_18:44:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.071174e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.149873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.157266e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.051965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.137518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.145207e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480187 sec - 2,056,422,141 cycles # 2.821 GHz - 2,909,868,255 instructions # 1.42 insn per cycle - 0.789769149 seconds time elapsed +TOTAL : 0.478432 sec + 2,109,852,076 cycles # 2.912 GHz + 3,003,170,596 instructions # 1.42 insn per cycle + 0.786290391 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.550689e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.551508e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.551508e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.696247e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.697090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.697090e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.550873 sec - 85,210,035,482 cycles # 2.883 GHz - 134,053,525,503 instructions # 1.57 insn per cycle - 29.554932127 seconds time elapsed +TOTAL : 28.796673 sec + 85,609,391,108 cycles # 2.973 GHz + 134,000,412,758 instructions # 1.57 insn per cycle + 28.800679620 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:15171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349729240374E-004 -Relative difference = 4.085374577342176e-09 +Avg ME (F77/C++) = 6.6275349728753263E-004 +Relative difference = 4.0927243740924655e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.704049e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.715826e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.715826e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.989362e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.001987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.001987e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.451563 sec - 6,575,110,645 cycles # 2.679 GHz - 19,101,194,250 instructions # 2.91 insn per cycle - 2.455617178 seconds time elapsed +TOTAL : 2.351611 sec + 6,566,797,613 cycles # 2.789 GHz + 19,100,899,857 instructions # 2.91 insn per cycle + 2.355465395 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:68204) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ 
-143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.461044e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.466509e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.466509e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.510070e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.515932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.515932e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.127472 sec - 3,056,173,108 cycles # 2.702 GHz - 6,654,226,606 instructions # 2.18 insn per cycle - 1.131533762 seconds time elapsed +TOTAL : 1.091015 sec + 3,052,717,269 cycles # 2.790 GHz + 6,654,298,695 instructions # 2.18 insn per cycle + 1.094968238 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.769806e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.777757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.777757e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814318e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822896e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822896e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.931579 sec - 2,522,992,718 cycles # 2.700 GHz - 5,975,076,879 instructions # 2.37 insn per cycle - 0.935429613 seconds time elapsed +TOTAL : 0.909034 sec + 2,522,325,941 cycles # 2.766 GHz + 5,975,152,301 instructions # 2.37 insn per cycle + 0.912842387 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41660) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.345570e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.350413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.350413e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.395808e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.400658e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400658e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.223621 sec - 2,097,428,008 cycles # 1.710 GHz - 3,514,537,932 instructions # 1.68 insn per cycle - 1.227733047 seconds time elapsed +TOTAL : 1.180152 sec + 2,097,547,243 cycles # 1.773 GHz + 3,514,375,733 instructions # 1.68 insn per cycle + 1.184183449 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4173) (512y: 4) (512z:44470) ------------------------------------------------------------------------- 
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling index 10d80cdca4..f879c173e8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:43:12 +DATE: 2025-12-07_18:00:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.858419e+05 1 256 -3.745329e+05 2 256 -3.897177e+05 4 256 -4.239569e+05 8 256 -4.437166e+05 16 256 -4.444009e+05 32 256 -4.485074e+05 64 256 -4.433314e+05 128 256 -4.512938e+05 256 256 -4.568500e+05 512 256 -4.555629e+05 1024 256 +3.033272e+05 1 256 +3.707193e+05 2 256 +3.891181e+05 4 256 +4.166580e+05 8 256 +4.422509e+05 16 256 +4.446368e+05 32 256 +4.460016e+05 64 256 +4.425018e+05 128 256 +4.541688e+05 256 256 +4.587863e+05 512 256 +4.560556e+05 1024 256 ### GPU: scaling test 32 -5.657558e+04 1 32 -1.070333e+05 2 32 -1.849532e+05 4 32 -2.657280e+05 8 32 -3.949685e+05 16 32 -3.946154e+05 32 32 -4.350193e+05 64 32 -4.473966e+05 128 32 -4.519860e+05 256 32 -4.459799e+05 512 32 -4.463425e+05 1024 32 -4.512453e+05 2048 32 -4.596972e+05 4096 32 -4.567015e+05 8192 32 +5.851204e+04 1 32 +1.108146e+05 2 32 +1.865237e+05 4 32 +2.640692e+05 8 32 +3.859613e+05 16 32 +3.998505e+05 32 32 +4.303968e+05 64 32 +4.449343e+05 128 32 +4.483729e+05 256 32 +4.455086e+05 512 32 +4.435749e+05 1024 32 +4.506847e+05 2048 32 +4.617559e+05 4096 32 +4.571041e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.832892e+03 1 256 -1.824058e+03 2 256 -1.836696e+03 4 256 +1.848855e+03 1 256 +1.843813e+03 2 256 +1.845676e+03 4 256 ### CPU: scaling test 32 -1.828347e+03 1 32 -1.832242e+03 2 32 -1.831046e+03 4 32 +1.833217e+03 1 32 +1.833594e+03 2 32 +1.845431e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.486552e+03 1 256 -3.490138e+03 2 256 -3.498447e+03 4 256 +3.435565e+03 1 256 +3.408800e+03 2 256 +3.394142e+03 4 256 ### CPU: scaling test 32 -3.349673e+03 1 32 -3.424966e+03 2 32 -3.419275e+03 4 32 +3.282554e+03 1 32 +3.293937e+03 2 32 +3.348372e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.965219e+03 1 256 -7.977523e+03 2 256 -8.081277e+03 4 256 +7.795739e+03 1 256 +7.762043e+03 2 256 +7.862976e+03 4 256 ### CPU: scaling test 32 -7.768804e+03 1 32 -7.471564e+03 2 32 -7.954694e+03 4 32 +7.208352e+03 1 32 +7.435286e+03 2 32 +7.511458e+03 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.159079e+03 1 256 -9.181848e+03 2 256 -9.256886e+03 4 256 +8.950971e+03 1 256 +8.861965e+03 2 256 +8.999994e+03 4 256 ### CPU: scaling test 32 -8.945974e+03 1 32 -8.898384e+03 2 32 -8.978221e+03 4 32 +8.282253e+03 1 32 +8.292454e+03 2 32 +8.534814e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.830723e+03 1 256 -6.905755e+03 2 256 -6.932432e+03 4 256 +6.808311e+03 1 256 +6.823008e+03 2 256 +6.771836e+03 4 256 ### CPU: scaling test 32 -6.653413e+03 1 32 -6.716747e+03 2 32 -6.760196e+03 4 32 +6.705851e+03 1 32 +6.745220e+03 2 32 +6.792038e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index e3e2b43997..d02d93e3b8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:24:46 +DATE: 2025-12-07_17:42:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.393156e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441810e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.445057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.412190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.454186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.457014e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.502434 sec - 2,151,870,507 cycles # 2.842 GHz - 3,130,235,445 instructions # 1.45 insn per cycle - 0.824960007 seconds time elapsed +TOTAL : 0.497755 sec + 2,181,307,250 cycles # 2.911 GHz + 3,205,591,870 instructions # 1.47 insn per cycle + 0.811418885 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.825164e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826053e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826053e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.910476e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.911438e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911438e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.992021 sec - 26,029,577,464 cycles # 2.894 GHz - 79,114,128,675 instructions # 3.04 insn per cycle - 8.996124488 seconds time elapsed +TOTAL : 8.591050 sec + 25,631,743,957 cycles # 2.983 GHz + 78,359,817,221 instructions # 3.06 insn per cycle + 8.594967237 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.429291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.432449e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.432449e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.588165e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.591523e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.591523e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.789072 sec - 12,824,725,318 cycles # 2.676 GHz - 38,757,792,368 instructions # 3.02 insn per cycle - 4.793199776 seconds time elapsed +TOTAL : 4.576643 sec + 12,802,251,788 cycles # 2.796 GHz + 38,731,973,234 instructions # 3.03 insn per cycle + 4.580623377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.935628e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.953025e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.953025e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.285972e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.303571e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.303571e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072950 sec - 5,562,263,841 cycles # 2.679 GHz - 13,540,518,730 instructions # 2.43 insn per cycle - 2.077092697 seconds time elapsed +TOTAL : 1.985598 sec + 5,551,173,792 cycles # 2.791 GHz + 13,503,461,466 instructions # 2.43 insn per cycle + 1.989569366 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events 
across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.986204e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.007643e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.007643e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.402547e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.425459e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.425459e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.831318 sec - 4,854,515,630 cycles # 2.646 GHz - 12,237,415,635 instructions # 2.52 insn per cycle - 1.835524858 seconds time elapsed +TOTAL : 1.750427 sec + 4,869,029,521 cycles # 2.777 GHz + 12,201,762,641 instructions # 2.51 insn per cycle + 1.754502394 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.899014e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.911241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.911241e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.245390e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.258120e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.258120e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 -TOTAL : 2.383753 sec - 4,111,562,734 cycles # 1.722 GHz - 6,282,557,303 instructions # 1.53 insn per cycle - 2.388073448 seconds time elapsed +TOTAL : 2.270076 sec + 4,098,896,035 cycles # 1.803 GHz + 6,259,253,577 instructions # 1.53 insn per cycle + 2.274088843 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling index 5eb0658f4e..cd9fa98742 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:59:44 +DATE: 2025-12-07_18:16:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.478169e+05 1 256 -2.269338e+05 2 256 -2.908405e+05 4 256 -3.460040e+05 8 256 -3.706753e+05 16 256 -3.850253e+05 32 256 -3.834285e+05 64 256 -3.887436e+05 128 256 -3.877878e+05 256 256 -3.930166e+05 512 256 -4.044746e+05 1024 256 +1.529752e+05 1 256 +2.279290e+05 2 256 +2.958876e+05 4 256 +3.463165e+05 8 256 +3.752952e+05 16 256 +3.838083e+05 32 256 +3.851068e+05 64 256 +3.839420e+05 128 256 +3.942627e+05 256 256 +3.926422e+05 512 256 +4.039633e+05 1024 256 ### GPU: scaling test 32 -2.315019e+04 1 32 -4.199167e+04 2 32 -8.231040e+04 4 32 -1.430769e+05 8 32 -2.353840e+05 16 32 -2.941154e+05 32 32 -3.501493e+05 64 32 -3.762161e+05 128 32 -3.849858e+05 256 32 -3.843601e+05 512 32 -3.882366e+05 1024 32 -3.853348e+05 2048 32 -3.939954e+05 4096 32 -4.042764e+05 8192 32 +2.331556e+04 1 32 +4.458719e+04 2 32 +8.619389e+04 4 32 +1.503990e+05 8 32 +2.380508e+05 16 32 +2.937799e+05 32 32 +3.501627e+05 64 32 +3.745244e+05 128 32 +3.856041e+05 256 32 +3.868839e+05 512 32 +3.940869e+05 1024 32 +3.946434e+05 2048 32 +3.911312e+05 4096 32 +4.019417e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.820929e+03 1 256 -1.819554e+03 2 256 -1.824693e+03 4 256 +1.920267e+03 1 256 +1.927274e+03 2 256 +1.929800e+03 4 256 ### CPU: scaling test 32 -1.809922e+03 1 32 -1.818380e+03 2 32 -1.829598e+03 4 32 +1.866080e+03 1 32 +1.896383e+03 2 32 +1.882472e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.467484e+03 1 256 -3.477201e+03 2 256 -3.483666e+03 4 256 +3.616096e+03 1 256 +3.629690e+03 2 256 +3.596360e+03 4 256 ### CPU: scaling test 32 -3.376210e+03 1 32 -3.385787e+03 2 32 -3.462870e+03 4 32 +3.409597e+03 1 32 +3.542662e+03 2 32 +3.571818e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 
-7.773756e+03 1 256 -7.868538e+03 2 256 -7.891583e+03 4 256 +8.123121e+03 1 256 +8.180386e+03 2 256 +8.316600e+03 4 256 ### CPU: scaling test 32 -7.767594e+03 1 32 -7.512875e+03 2 32 -7.861406e+03 4 32 +7.885918e+03 1 32 +7.991995e+03 2 32 +7.595516e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.905874e+03 1 256 -9.000800e+03 2 256 -9.159354e+03 4 256 +9.287468e+03 1 256 +9.329861e+03 2 256 +9.506600e+03 4 256 ### CPU: scaling test 32 -9.007891e+03 1 32 -8.853559e+03 2 32 -8.999340e+03 4 32 +9.095224e+03 1 32 +8.933194e+03 2 32 +9.097781e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.725095e+03 1 256 -6.926689e+03 2 256 -6.793100e+03 4 256 +7.119814e+03 1 256 +7.189912e+03 2 256 +7.259316e+03 4 256 ### CPU: scaling test 32 -6.759773e+03 1 32 -6.705987e+03 2 32 -6.758642e+03 4 32 +6.713136e+03 1 32 +6.939876e+03 2 32 +6.880373e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt index 8b06b13019..78fe0a7c40 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:53:12 +DATE: 2025-12-07_18:09:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.813357e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.847839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850325e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803501e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.833998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837394e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.193508 sec - 4,401,135,195 cycles # 2.829 GHz - 6,108,788,422 instructions # 1.39 insn per cycle - 1.613268691 seconds time elapsed +TOTAL : 1.172166 sec + 4,469,354,340 cycles # 2.918 GHz + 6,161,301,053 instructions # 1.38 insn per cycle + 1.592362073 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.815440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.816305e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.816305e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.903103e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904047e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904047e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.040328 sec - 26,031,336,563 cycles # 2.879 GHz - 79,117,154,926 instructions # 3.04 insn per cycle - 9.044442399 seconds time elapsed +TOTAL : 8.624141 sec + 25,610,646,944 cycles # 2.969 GHz + 78,359,903,560 instructions # 3.06 insn per cycle + 8.628149489 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 
+Avg ME (F77/C++) = 6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.427905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.431039e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.431039e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.545044e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.548349e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548349e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.790651 sec - 12,832,687,294 cycles # 2.677 GHz - 38,758,106,395 instructions # 3.02 insn per cycle - 4.794734568 seconds time elapsed +TOTAL : 4.632583 sec + 12,817,242,959 cycles # 2.765 GHz + 38,731,650,829 instructions # 3.02 insn per cycle + 4.636462804 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.935202e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.951558e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.951558e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.270582e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.287354e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.287354e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072958 sec - 5,568,085,348 cycles # 2.682 GHz - 13,540,506,751 instructions # 2.43 insn per cycle - 2.076971724 seconds time elapsed +TOTAL : 1.989008 sec + 5,550,748,494 cycles # 2.787 GHz + 13,504,722,841 instructions # 2.43 insn per cycle + 1.992814081 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.161412e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.183655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.183655e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.422545e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.445131e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.445131e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.796303 sec - 4,854,337,043 cycles # 2.698 GHz - 12,237,142,563 instructions # 2.52 insn per cycle - 1.800481736 seconds time elapsed +TOTAL : 1.747059 sec + 4,864,475,985 cycles # 2.779 GHz + 12,200,329,692 instructions # 2.51 insn per cycle + 1.750940726 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.873484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.885441e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.885441e+03 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 7.232352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.245293e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.245293e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.392508 sec - 4,106,170,622 cycles # 1.714 GHz - 6,282,499,145 instructions # 1.53 insn per cycle - 2.396728116 seconds time elapsed +TOTAL : 2.274108 sec + 4,084,136,471 cycles # 1.794 GHz + 6,259,213,945 instructions # 1.53 insn per cycle + 2.277966276 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt index 1a693ccc02..845b04a5d6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:51:16 +DATE: 2025-12-07_19:17:29 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.425282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.474579e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.477977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.407973e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.452544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.455533e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505604 sec - 2,079,342,335 cycles # 2.823 GHz - 3,110,113,358 instructions # 1.50 insn per cycle - 0.804143585 seconds time elapsed +TOTAL : 0.495567 sec + 2,103,155,472 cycles # 2.904 GHz + 3,161,009,673 instructions # 1.50 insn per cycle + 0.783663948 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.820544e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.821419e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.821419e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.903957e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904922e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904922e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.014922 sec - 26,029,815,792 cycles # 2.887 GHz - 79,113,148,007 instructions # 3.04 insn per cycle - 9.018853711 seconds time elapsed +TOTAL : 8.620281 sec + 25,615,999,913 cycles # 2.971 GHz + 78,360,081,826 instructions # 3.06 insn per cycle + 8.624240909 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 
+Avg ME (F77/C++) = 6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.422911e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.426145e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.426145e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.567656e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.570931e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570931e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.797700 sec - 12,826,872,860 cycles # 2.672 GHz - 38,756,601,713 instructions # 3.02 insn per cycle - 4.801871860 seconds time elapsed +TOTAL : 4.603331 sec + 12,813,125,125 cycles # 2.782 GHz + 38,731,732,397 instructions # 3.02 insn per cycle + 4.607478443 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.944046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.960023e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.960023e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228090e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.245686e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.245686e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.070707 sec - 5,566,396,722 cycles # 2.684 GHz - 13,540,340,017 instructions # 2.43 insn per cycle - 2.074804703 seconds time elapsed +TOTAL : 1.999443 sec + 5,548,322,448 cycles # 2.771 GHz + 13,503,603,137 instructions # 2.43 insn per cycle + 2.003472729 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.072103e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.093961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.093961e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.250710e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.273292e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.273292e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.814093 sec - 4,852,758,403 cycles # 2.670 GHz - 12,237,059,875 instructions # 2.52 insn per cycle - 1.818055824 seconds time elapsed +TOTAL : 1.779524 sec + 4,860,656,125 cycles # 2.726 GHz + 12,200,726,069 instructions # 2.51 insn per cycle + 1.783868025 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.846048e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.858465e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.858465e+03 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 7.268512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.281915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.281915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.401888 sec - 4,113,800,876 cycles # 1.711 GHz - 6,282,877,511 instructions # 1.53 insn per cycle - 2.405935799 seconds time elapsed +TOTAL : 2.262615 sec + 4,100,410,296 cycles # 1.810 GHz + 6,259,057,630 instructions # 1.53 insn per cycle + 2.266595149 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 55816a282e..33ecf66852 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:25:29 +DATE: 2025-12-07_17:42:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.409960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.457193e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.460417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.442861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.483784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.486753e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.500032 sec - 2,128,939,464 cycles # 2.818 GHz - 3,048,895,103 instructions # 1.43 insn per cycle - 0.815266921 seconds time elapsed +TOTAL : 0.496951 sec + 2,176,572,021 cycles # 2.908 GHz + 3,156,100,739 instructions # 1.45 insn per cycle + 0.810324850 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.835004e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835894e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835894e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920962e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.921930e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.921930e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.943891 sec - 25,955,962,699 cycles # 2.901 GHz - 79,198,038,648 instructions # 3.05 insn per cycle - 8.947961266 seconds time elapsed +TOTAL : 8.543607 sec + 25,551,315,087 cycles # 2.990 GHz + 78,445,468,835 instructions # 3.07 insn per cycle + 8.547533743 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 
+Avg ME (F77/C++) = 6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.464500e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467677e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467677e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.513978e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517095e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517095e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.740131 sec - 12,742,308,756 cycles # 2.686 GHz - 38,685,964,134 instructions # 3.04 insn per cycle - 4.744223175 seconds time elapsed +TOTAL : 4.673363 sec + 12,721,786,752 cycles # 2.720 GHz + 38,661,629,021 instructions # 3.04 insn per cycle + 4.677367975 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:12933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.985627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.001632e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.001632e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.268215e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.285883e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.285883e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.059737 sec - 5,594,595,243 cycles # 2.712 GHz - 13,643,577,301 instructions # 2.44 insn per cycle - 2.063806863 seconds time elapsed +TOTAL : 1.989374 sec + 5,564,172,875 cycles # 2.792 GHz + 13,606,469,232 instructions # 2.45 insn per cycle + 1.993437958 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.864560e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.884766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.884766e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.310549e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.333133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.333133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.855976 sec - 5,031,540,017 cycles # 2.706 GHz - 12,343,462,839 instructions # 2.45 insn per cycle - 1.860103785 seconds time elapsed +TOTAL : 1.767531 sec + 4,897,230,565 cycles # 2.766 GHz + 12,306,690,764 instructions # 2.51 insn per cycle + 1.771564380 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10307) (512y: 226) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.836346e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.848432e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.848432e+03 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 7.220621e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.234515e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234515e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.405420 sec - 4,109,302,173 cycles # 1.706 GHz - 6,383,895,140 instructions # 1.55 insn per cycle - 2.409513085 seconds time elapsed +TOTAL : 2.278109 sec + 4,087,314,098 cycles # 1.792 GHz + 6,360,920,750 instructions # 1.56 insn per cycle + 2.282066462 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1734) (512y: 178) (512z: 9357) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling index f43e214106..c78dfa4433 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:45:06 +DATE: 2025-12-07_18:01:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.314898e+04 1 256 -1.332401e+04 2 256 -1.369745e+04 4 256 -1.359022e+04 8 256 -1.360893e+04 16 256 -1.354758e+04 32 256 -1.335068e+04 64 256 -1.340355e+04 128 256 -1.338225e+04 256 256 +1.320375e+04 1 256 +1.336620e+04 2 256 +1.375014e+04 4 256 +1.351478e+04 8 256 +1.366076e+04 16 256 +1.351374e+04 32 256 +1.331573e+04 64 256 +1.337758e+04 128 256 +1.336544e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
### GPU: scaling test 32 -6.222590e+03 1 32 -1.054070e+04 2 32 -1.256578e+04 4 32 -1.334543e+04 8 32 -1.351998e+04 16 32 -1.363026e+04 32 32 -1.353031e+04 64 32 -1.331302e+04 128 32 -1.311792e+04 256 32 -1.318049e+04 512 32 -1.308983e+04 1024 32 -1.314766e+04 2048 32 +6.405321e+03 1 32 +1.094245e+04 2 32 +1.300303e+04 4 32 +1.349553e+04 8 32 +1.361050e+04 16 32 +1.353855e+04 32 32 +1.359083e+04 64 32 +1.356763e+04 128 32 +1.310218e+04 256 32 +1.309140e+04 512 32 +1.316742e+04 1024 32 +1.310345e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.572551e+01 1 256 -7.477397e+01 2 256 -7.590781e+01 4 256 +7.847562e+01 1 256 +7.748521e+01 2 256 +7.817812e+01 4 256 ### CPU: scaling test 32 -7.544857e+01 1 32 -7.629914e+01 2 32 -7.644630e+01 4 32 +7.829230e+01 1 32 +7.791452e+01 2 32 +7.805661e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.436664e+02 1 256 -1.430259e+02 2 256 -1.425156e+02 4 256 +1.466404e+02 1 256 +1.476809e+02 2 256 +1.468656e+02 4 256 ### CPU: scaling test 32 -1.332283e+02 1 32 -1.407923e+02 2 32 -1.434345e+02 4 32 +1.472104e+02 1 32 +1.473841e+02 2 32 +1.460610e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.322512e+02 1 256 -3.302235e+02 2 256 -3.299895e+02 4 256 +3.405978e+02 1 256 +3.404266e+02 2 256 +3.388501e+02 4 256 ### CPU: scaling test 32 -3.290820e+02 1 32 -3.272276e+02 2 32 -3.284861e+02 4 32 +3.385950e+02 1 32 +3.371712e+02 2 32 +3.272741e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.744622e+02 1 256 -3.794847e+02 2 256 -3.813583e+02 4 256 +3.854137e+02 1 256 +3.936677e+02 2 256 +3.919367e+02 4 256 ### CPU: scaling test 32 -3.817338e+02 1 32 -3.782027e+02 2 32 -3.808702e+02 4 32 +3.908107e+02 1 32 +3.924092e+02 2 32 +3.897875e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.362403e+02 1 256 -3.316419e+02 2 256 -3.338911e+02 4 256 +3.497381e+02 1 256 +3.483810e+02 2 256 +3.405504e+02 4 256 ### CPU: scaling test 32 -3.305571e+02 1 32 -3.318824e+02 2 32 -3.293878e+02 4 32 +3.285955e+02 1 32 +3.474597e+02 2 32 +3.504914e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 
index cc68408e75..6a92519959 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:29:32 +DATE: 2025-12-07_17:46:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.298542e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.302743e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.303449e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.321607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325667e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326320e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.859583 sec - 3,373,995,346 cycles # 2.854 GHz - 5,824,456,888 instructions # 1.73 insn per cycle - 1.243469488 seconds time elapsed +TOTAL : 0.821747 sec + 3,341,377,270 cycles # 2.932 GHz + 5,898,121,294 instructions # 1.77 insn per cycle + 1.196775904 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.340939e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341409e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341443e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.334653e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.335074e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335103e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.040862 sec - 6,994,210,497 cycles # 2.880 GHz - 14,374,198,066 instructions # 2.06 insn per cycle - 2.485321107 seconds time elapsed +TOTAL : 2.021138 sec + 7,127,140,917 cycles # 2.956 GHz + 14,907,704,581 instructions # 2.09 insn per cycle + 2.467572881 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.481211e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.481430e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.481430e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.728828e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.729048e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.729048e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.060224 sec - 18,790,658,377 cycles # 2.660 GHz - 53,598,343,943 instructions # 2.85 insn per cycle - 7.064353743 seconds time elapsed +TOTAL : 6.831322 sec + 18,702,297,373 cycles # 2.737 GHz + 53,151,294,412 instructions # 2.84 insn per cycle + 6.835137623 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.428763e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428836e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.461892e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.461977e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461977e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.697310 sec - 9,985,153,992 cycles # 2.699 GHz - 27,152,471,347 instructions # 2.72 insn per cycle - 3.701453086 seconds time elapsed +TOTAL : 3.613440 sec + 9,961,346,738 cycles # 2.755 GHz + 27,152,677,939 instructions # 2.73 insn per cycle + 3.617289764 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.245847e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246221e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246221e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.395264e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.395701e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395701e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.628561 sec - 4,350,647,315 cycles # 2.666 GHz - 9,591,385,784 instructions # 2.20 insn per cycle - 1.632600458 seconds time elapsed +TOTAL : 1.557901 sec + 4,335,666,635 cycles # 2.777 GHz + 9,591,172,329 instructions # 2.21 insn per cycle + 1.561957003 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.817880e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.818408e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818408e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.939732e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.940267e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.940267e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.385265 sec - 3,747,713,325 cycles # 2.699 GHz - 8,516,229,683 instructions # 2.27 insn per cycle - 1.389377029 seconds time elapsed +TOTAL : 1.342813 sec + 3,748,544,251 cycles # 2.785 GHz + 8,516,140,066 instructions # 2.27 insn per cycle + 1.346850165 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.278490e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.278974e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.278974e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.497472e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.498006e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.498006e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.612258 sec - 2,716,765,553 cycles # 1.682 GHz - 4,276,097,512 instructions # 1.57 insn per cycle - 1.616451427 seconds time elapsed +TOTAL : 1.512957 sec + 2,715,207,137 cycles # 1.791 GHz + 4,275,323,609 instructions # 1.57 insn per cycle + 1.517015632 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling index 8b91486c13..311de36ce1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_16:01:16 +DATE: 2025-12-07_18:17:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -34,29 +28,29 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.582972e+04 1 256 -1.581496e+04 2 256 -1.648948e+04 4 256 -1.646203e+04 8 256 -1.669439e+04 16 256 -1.647826e+04 32 256 -1.616020e+04 64 256 -1.617952e+04 128 256 +1.600417e+04 1 256 +1.586945e+04 2 256 +1.646579e+04 4 256 +1.644366e+04 8 256 +1.661298e+04 16 256 +1.629890e+04 32 256 +1.626662e+04 64 256 +1.605207e+04 128 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -6.365790e+03 1 32 -1.117842e+04 2 32 -1.456730e+04 4 32 -1.611806e+04 8 32 -1.598649e+04 16 32 -1.653700e+04 32 32 -1.595595e+04 64 32 -1.589958e+04 128 32 -1.560604e+04 256 32 -1.549794e+04 512 32 -1.560588e+04 1024 32 +6.551178e+03 1 32 +1.156930e+04 2 32 +1.466119e+04 4 32 +1.618017e+04 8 32 +1.588665e+04 16 32 +1.637067e+04 32 32 +1.585083e+04 64 32 +1.588662e+04 128 32 +1.551750e+04 256 32 +1.550638e+04 512 32 +1.551491e+04 1024 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.550960e+01 1 256 -7.583079e+01 2 256 -7.562936e+01 4 256 +7.886798e+01 1 256 +7.820595e+01 2 256 +7.855823e+01 4 256 ### CPU: scaling test 32 -7.095115e+01 1 32 -7.526184e+01 2 32 -7.561728e+01 4 32 +7.971176e+01 1 32 +7.917743e+01 2 32 +7.945917e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.416397e+02 1 256 -1.419941e+02 2 256 -1.424152e+02 4 256 +1.479520e+02 1 256 +1.494174e+02 2 256 +1.496134e+02 4 256 ### CPU: scaling test 32 -1.379937e+02 1 32 -1.386213e+02 2 32 -1.419191e+02 4 32 +1.487121e+02 1 32 +1.483091e+02 2 32 +1.490177e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.312097e+02 1 256 -3.311144e+02 2 256 -3.322186e+02 4 256 +3.402049e+02 1 256 +3.401223e+02 2 256 +3.353553e+02 4 256 ### CPU: scaling test 32 -3.304901e+02 1 32 -3.322880e+02 2 32 -3.277376e+02 4 32 +3.444222e+02 1 32 +3.416991e+02 2 32 +3.437550e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.821829e+02 1 256 -3.805165e+02 2 256 -3.788227e+02 4 256 +3.963594e+02 1 256 +3.917392e+02 2 256 +3.827051e+02 4 256 ### 
CPU: scaling test 32 -3.729139e+02 1 32 -3.757926e+02 2 32 -3.738019e+02 4 32 +3.846538e+02 1 32 +3.965483e+02 2 32 +3.718410e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.317613e+02 1 256 -3.319298e+02 2 256 -3.365958e+02 4 256 +3.530580e+02 1 256 +3.540897e+02 2 256 +3.547233e+02 4 256 ### CPU: scaling test 32 -3.353901e+02 1 32 -3.366346e+02 2 32 -3.378136e+02 4 32 +3.553480e+02 1 32 +3.507123e+02 2 32 +3.563661e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 4b40dd2c65..492521bc41 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:32:38 +DATE: 2025-12-07_18:52:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -38,14 +32,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.248729e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.286569e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.286569e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.302566e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313301e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313301e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.825135 sec - 3,263,718,300 cycles # 2.850 GHz - 5,063,977,049 instructions # 1.55 insn per cycle - 1.201910757 seconds time elapsed +TOTAL : 0.815502 sec + 3,300,106,601 cycles # 2.920 GHz + 5,753,571,925 instructions # 1.74 insn per cycle + 1.190500644 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -63,14 +57,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.351586e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.359293e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359293e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.324470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331300e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.006826 sec - 6,868,164,513 cycles # 2.869 GHz - 12,771,043,874 instructions # 1.86 insn per cycle - 2.451670895 seconds time elapsed +TOTAL : 2.005788 sec + 7,050,944,864 cycles # 2.950 GHz + 14,956,012,357 instructions # 2.12 insn per cycle + 2.449574903 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. @@ -91,14 +85,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.508335e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.508560e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.508560e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.729527e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.729740e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.729740e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.038136 sec - 18,717,847,899 cycles # 2.659 GHz - 53,598,418,673 instructions # 2.86 insn per cycle - 7.042371275 seconds time elapsed +TOTAL : 6.834321 sec + 18,758,889,717 cycles # 2.744 GHz + 53,152,056,226 instructions # 2.83 insn per cycle + 6.838459078 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -118,14 +112,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.418673e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418747e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418747e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.462641e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.462717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.462717e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.725271 sec - 9,999,898,907 cycles # 2.682 GHz - 27,154,408,541 instructions # 2.72 insn per cycle - 3.729470107 seconds time elapsed +TOTAL : 3.611535 sec + 10,029,551,156 cycles # 2.775 GHz + 27,154,364,603 instructions # 2.71 insn per cycle + 3.615661228 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -145,14 +139,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.288517e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.288903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.288903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.401721e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.402134e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.402134e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.608418 sec - 4,321,971,855 cycles # 2.681 GHz - 9,593,457,987 instructions # 2.22 insn per cycle - 1.612824235 seconds time elapsed +TOTAL : 1.554885 sec + 4,327,474,958 cycles # 2.777 GHz + 9,593,210,515 instructions # 2.22 insn per cycle + 1.559120852 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -172,14 +166,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.731794e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.732300e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.732300e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.933392e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.933938e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.933938e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.417269 sec - 3,781,284,257 cycles # 2.661 GHz - 8,518,492,306 instructions # 2.25 insn per cycle - 1.421504706 seconds time elapsed +TOTAL : 1.345457 sec + 3,749,368,911 cycles # 2.780 GHz + 8,518,109,411 instructions # 2.27 insn per cycle + 1.349555175 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -199,14 +193,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.320041e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320569e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320569e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.487659e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.488259e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.488259e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.593109 sec - 2,718,981,575 cycles # 1.703 GHz - 4,277,734,000 instructions # 1.57 insn per cycle - 1.597391554 seconds time elapsed +TOTAL : 1.517053 sec + 2,715,282,487 cycles # 1.786 GHz + 4,277,426,001 instructions # 1.58 insn per cycle + 1.521366886 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index a8f385308e..8eea74c3a5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:31:21 +DATE: 2025-12-07_17:48:27 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.314413e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318852e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.319620e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.307593e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.311911e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.312704e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.824375 sec - 3,263,300,002 cycles # 2.859 GHz - 5,743,287,797 instructions # 1.76 insn per cycle - 1.201709138 seconds time elapsed +TOTAL : 0.821952 sec + 3,317,370,082 cycles # 2.908 GHz + 5,678,819,301 instructions # 1.71 insn per cycle + 1.197459104 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.342823e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.343338e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.343373e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.343314e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343738e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343769e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.030004 sec - 6,944,802,894 cycles # 2.872 GHz - 14,733,879,509 instructions # 2.12 insn per cycle - 2.474432206 seconds time elapsed +TOTAL : 2.019557 sec + 7,107,198,445 cycles # 2.958 GHz + 15,372,061,161 instructions # 2.16 insn per cycle + 2.462329647 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. @@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.570860e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571065e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.571065e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.771624e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.771837e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.771837e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.976560 sec - 18,730,478,677 cycles # 2.684 GHz - 53,589,432,540 instructions # 2.86 insn per cycle - 6.980695916 seconds time elapsed +TOTAL : 6.796782 sec + 18,695,159,140 cycles # 2.750 GHz + 53,144,330,535 instructions # 2.84 insn per cycle + 6.800659193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411301e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.411372e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411372e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.458463e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.458538e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458538e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.742394 sec - 10,077,544,611 cycles # 2.691 GHz - 27,148,181,137 instructions # 2.69 insn per cycle - 3.746519189 seconds time elapsed +TOTAL : 3.620942 sec + 10,043,257,962 cycles # 2.771 GHz + 27,146,989,501 instructions # 2.70 
insn per cycle + 3.624787675 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96336) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.358190e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358704e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358704e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.409783e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410183e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410183e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.574465 sec - 4,261,924,263 cycles # 2.701 GHz - 9,596,051,273 instructions # 2.25 insn per cycle - 1.578699681 seconds time elapsed +TOTAL : 1.550764 sec + 4,271,013,266 cycles # 2.748 GHz + 9,596,000,773 instructions # 2.25 insn per cycle + 1.554805786 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.774770e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775320e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.775320e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.875674e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.876174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.876174e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.400584 sec - 3,755,242,155 cycles # 2.675 GHz - 8,521,276,194 instructions # 2.27 insn per cycle - 1.404663616 seconds time elapsed +TOTAL : 1.364428 sec + 3,777,129,340 cycles # 2.762 GHz + 8,520,886,982 instructions # 2.26 insn per cycle + 1.368280687 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80635) (512y: 225) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.329909e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.330461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.330461e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.513200e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.513689e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.513689e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.587980 sec - 2,712,476,158 cycles # 1.704 GHz - 4,282,456,457 instructions # 1.58 
insn per cycle - 1.592350341 seconds time elapsed +TOTAL : 1.505667 sec + 2,713,460,031 cycles # 1.798 GHz + 4,281,796,120 instructions # 1.58 insn per cycle + 1.509752311 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2702) (512y: 175) (512z:79107) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling index 2d50000d27..4541366d35 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:49:04 +DATE: 2025-12-07_18:05:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.189617e+04 1 256 -3.247454e+04 2 256 -3.572888e+04 4 256 -3.576406e+04 8 256 -3.574054e+04 16 256 -3.604686e+04 32 256 -3.591831e+04 64 256 -3.590498e+04 128 256 -3.586335e+04 256 256 +3.177974e+04 1 256 +3.279446e+04 2 256 +3.574681e+04 4 256 +3.582578e+04 8 256 +3.593926e+04 16 256 +3.597564e+04 32 256 +3.600280e+04 64 256 +3.596856e+04 128 256 +3.588532e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -7.716223e+03 1 32 -1.405251e+04 2 32 -2.073573e+04 4 32 -2.779764e+04 8 32 -3.326750e+04 16 32 -3.550921e+04 32 32 -3.542979e+04 64 32 -3.536735e+04 128 32 -3.605303e+04 256 32 -3.612470e+04 512 32 -3.604579e+04 1024 32 -3.604477e+04 2048 32 +7.770719e+03 1 32 +1.395323e+04 2 32 +2.096500e+04 4 32 +2.815794e+04 8 32 +3.343612e+04 16 32 +3.569877e+04 32 32 +3.610168e+04 64 32 +3.605983e+04 128 32 +3.630962e+04 256 32 +3.611950e+04 512 32 +3.610563e+04 1024 32 +3.600484e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.499895e+01 1 256 -8.500354e+01 2 256 -8.502793e+01 4 256 +8.779821e+01 1 256 +8.808191e+01 2 256 +8.756926e+01 4 256 ### CPU: scaling test 32 -8.566387e+01 1 32 -8.564579e+01 2 32 -8.546968e+01 4 32 +8.672834e+01 1 32 +8.735170e+01 2 32 +8.504494e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.082111e+02 1 256 -3.057097e+02 2 256 -3.015791e+02 4 256 +3.123127e+02 1 256 +3.137015e+02 2 256 +3.175470e+02 4 256 ### CPU: scaling test 32 -3.031632e+02 1 32 -3.047989e+02 2 32 -3.016953e+02 4 32 +3.217089e+02 1 32 +3.145887e+02 2 32 +3.141821e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.617272e+02 1 256 -6.661900e+02 2 256 -6.680386e+02 4 256 +6.884735e+02 1 256 +6.856273e+02 2 256 +6.916220e+02 4 256 ### CPU: scaling test 32 -6.677614e+02 1 32 -6.719546e+02 2 32 -6.659846e+02 4 32 +6.858923e+02 1 32 +6.849303e+02 2 32 +6.924496e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.611249e+02 1 256 -7.606905e+02 2 256 -7.604096e+02 4 256 +7.847390e+02 1 256 +7.841627e+02 2 256 +7.771554e+02 4 256 ### CPU: scaling test 32 -7.550844e+02 1 32 -7.531491e+02 2 32 -7.562334e+02 4 32 +7.776008e+02 1 32 +7.806739e+02 2 32 +7.699436e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.623690e+02 1 256 -6.648693e+02 2 256 -6.677195e+02 4 256 +7.100109e+02 1 256 +7.056232e+02 2 256 +7.056088e+02 4 256 ### CPU: scaling test 32 -6.549910e+02 1 32 -6.592485e+02 2 32 -6.593529e+02 4 32 +7.181969e+02 1 32 +7.033454e+02 2 32 +7.046953e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8d906ea4bc..b9ef432745 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:36:41 +DATE: 2025-12-07_17:53:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.066576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.085305e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.089254e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.149109e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.169607e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.173315e+04 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 -TOTAL : 0.755600 sec - 2,946,115,284 cycles # 2.846 GHz - 5,005,757,693 instructions # 1.70 insn per cycle - 1.092047091 seconds time elapsed +TOTAL : 0.739629 sec + 2,945,277,352 cycles # 2.918 GHz + 4,835,336,809 instructions # 1.64 insn per cycle + 1.066167022 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.576872e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.578746e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.578931e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.596718e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598620e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 1.197902 sec - 4,252,156,323 cycles # 2.858 GHz - 7,968,205,533 instructions # 1.87 insn per cycle - 1.544878632 seconds time elapsed +TOTAL : 1.178241 sec + 4,343,269,414 cycles # 2.946 GHz + 8,331,549,866 instructions # 1.92 insn per cycle + 1.531265727 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.452149e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.452401e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.452401e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.626039e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626291e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626291e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.250789 sec - 18,004,786,092 cycles # 2.879 GHz - 53,363,354,008 instructions # 2.96 insn per cycle - 6.254568811 seconds time elapsed +TOTAL : 6.121818 sec + 17,969,147,902 cycles # 2.934 GHz + 53,063,500,600 instructions # 2.95 insn per cycle + 6.125566878 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087517612E-003 -Relative difference = 2.1197460131000295e-08 +Avg ME (F77/C++) = 9.8479612087573973E-003 +Relative difference = 2.1198032444047986e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.083892e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084249e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.084249e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.186530e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186892e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186892e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.714898 sec - 4,637,516,396 cycles # 2.699 GHz - 13,808,277,295 instructions # 2.98 insn per cycle - 1.718840547 seconds time elapsed +TOTAL : 1.658991 sec + 4,633,349,982 cycles # 2.788 GHz + 13,807,904,887 instructions # 2.98 insn per cycle + 1.663020094 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.679481e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.681146e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.681146e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.889962e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.891607e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891607e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.793237 sec - 2,148,565,219 cycles # 2.697 GHz - 4,837,105,097 instructions # 2.25 insn per cycle - 0.797286288 seconds time elapsed +TOTAL : 0.768806 sec + 2,146,573,691 cycles # 2.780 GHz + 4,837,010,097 instructions # 2.25 insn per cycle + 0.772788428 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.502213e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.504225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.504225e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.794553e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.796736e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.796736e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.706205 sec - 1,896,245,897 cycles # 2.672 GHz - 4,291,845,754 instructions # 2.26 insn per cycle - 0.710269657 seconds time elapsed +TOTAL : 0.679743 sec + 1,898,062,516 cycles # 2.779 GHz + 4,291,798,325 instructions # 2.26 insn per cycle + 0.683721502 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.536289e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.538258e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.538258e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.039508e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.041978e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.041978e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.810162 sec - 1,363,414,955 cycles # 1.676 GHz - 2,159,791,218 instructions # 1.58 insn per cycle - 0.814367082 seconds time elapsed +TOTAL : 0.753271 sec + 1,363,600,272 cycles # 1.803 GHz + 2,159,623,004 instructions # 1.58 insn per cycle + 0.757185983 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling index b311421434..5123b9d4d7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:05:58 +DATE: 2025-12-07_18:22:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.033893e+04 1 256 -3.187494e+04 2 256 -3.481987e+04 4 256 -3.512251e+04 8 256 -3.538857e+04 16 256 -3.542822e+04 32 256 -3.543221e+04 64 256 -3.537512e+04 128 256 -3.502452e+04 256 256 +3.087667e+04 1 256 +3.200359e+04 2 256 +3.502386e+04 4 256 +3.531850e+04 8 256 +3.544635e+04 16 256 +3.528779e+04 32 256 +3.555934e+04 64 256 +3.539542e+04 128 256 +3.505604e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -7.725986e+03 1 32 -1.328194e+04 2 32 -1.942036e+04 4 32 -2.633854e+04 8 32 -3.294887e+04 16 32 -3.493545e+04 32 32 -3.529299e+04 64 32 -3.546637e+04 128 32 -3.548686e+04 256 32 -3.523534e+04 512 32 -3.522952e+04 1024 32 -3.514012e+04 2048 32 +7.815744e+03 1 32 +1.386300e+04 2 32 +1.907493e+04 4 32 +2.668468e+04 8 32 +3.305350e+04 16 32 +3.497059e+04 32 32 +3.546935e+04 64 32 +3.552986e+04 128 32 +3.573964e+04 256 32 +3.569543e+04 512 32 +3.550837e+04 1024 32 +3.521947e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.495344e+01 1 256 -8.539448e+01 2 256 -8.496927e+01 4 256 +8.844169e+01 1 256 +8.829740e+01 2 256 +8.843317e+01 4 256 ### CPU: scaling test 32 -8.470460e+01 1 32 -8.470926e+01 2 32 -8.506051e+01 4 32 +8.825391e+01 1 32 +8.820411e+01 2 32 +8.493115e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.029024e+02 1 256 -3.058068e+02 2 256 -3.092272e+02 4 256 +3.236550e+02 1 256 +3.200762e+02 2 256 +3.161138e+02 4 256 ### CPU: scaling test 32 -3.088673e+02 1 32 -3.061911e+02 2 32 -3.071123e+02 4 32 +3.186986e+02 1 32 +3.212506e+02 2 32 +3.220319e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.653819e+02 1 256 -6.661146e+02 2 256 -6.676979e+02 4 256 +6.928318e+02 1 256 +6.896541e+02 2 256 +6.922588e+02 4 256 ### CPU: scaling test 32 -6.681941e+02 1 32 -6.675336e+02 2 32 -6.688978e+02 4 32 +6.975684e+02 1 32 +6.944282e+02 2 32 +7.011291e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.615474e+02 1 256 -7.624411e+02 2 256 -7.580407e+02 4 256 +7.850778e+02 1 256 +7.879724e+02 2 256 +7.826994e+02 4 256 ### CPU: scaling test 32 -7.724123e+02 1 32 -7.622893e+02 2 32 -7.629688e+02 4 32 +7.864907e+02 1 32 +7.810580e+02 2 32 +7.820229e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.726799e+02 1 256 -6.675111e+02 2 256 -6.619522e+02 4 256 +6.576187e+02 1 256 +6.991658e+02 2 256 +7.121946e+02 4 256 ### CPU: scaling test 32 -6.616673e+02 1 32 -6.588386e+02 2 32 -6.622712e+02 4 32 +7.209230e+02 1 32 +7.182528e+02 2 32 +7.123456e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 66637c5d79..6fcf57922a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:34:27 +DATE: 2025-12-07_18:54:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -38,14 +32,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.846569e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.930073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.930073e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.926744e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.060634e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060634e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 0.744004 sec - 2,812,928,508 cycles # 2.768 GHz - 4,058,280,243 instructions # 1.44 insn per cycle - 1.074142514 seconds time elapsed +TOTAL : 0.738139 sec + 2,947,921,871 cycles # 2.927 GHz + 4,777,342,836 instructions # 1.62 insn per cycle + 1.065529832 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -63,14 +57,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.542471e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.575116e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575116e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.545021e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.577588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577588e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856440e-04 +- 8.331090e-05 ) GeV^-6 -TOTAL : 1.186896 sec - 4,180,690,234 cycles # 2.849 GHz - 8,037,777,996 instructions # 1.92 insn per cycle - 1.534789099 seconds time elapsed +TOTAL : 1.185570 sec + 4,270,441,231 cycles # 2.909 GHz + 8,425,554,190 instructions # 1.97 insn per cycle + 1.533113794 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -91,14 +85,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.504304e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.504560e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.504560e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.789711e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.790034e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.790034e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.212057 sec - 17,925,660,588 cycles # 2.884 GHz - 53,364,413,300 instructions # 2.98 insn per cycle - 6.216192253 seconds time elapsed +TOTAL : 6.010643 sec + 17,880,573,489 cycles # 2.973 GHz + 53,069,438,218 instructions # 2.97 insn per cycle + 6.014769904 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,8 +103,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087517612E-003 -Relative difference = 2.1197460131000295e-08 +Avg ME (F77/C++) = 9.8479612087573973E-003 +Relative difference = 2.1198032444047986e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -118,14 +112,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.026780e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.027128e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.027128e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.184990e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.185363e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.185363e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.746031 sec - 4,640,321,340 cycles # 2.653 GHz - 13,810,267,539 instructions # 2.98 insn per cycle - 1.750270483 seconds time elapsed +TOTAL : 1.661957 sec + 4,641,644,565 cycles # 2.787 GHz + 13,810,570,785 instructions # 2.98 insn per cycle + 1.666043966 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -145,14 +139,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.541416e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.543021e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.543021e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.824203e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.825978e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.825978e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.809578 sec - 2,161,931,873 cycles # 2.659 GHz - 4,839,517,439 instructions # 2.24 insn per cycle - 0.813642934 seconds time elapsed +TOTAL : 0.776575 sec + 2,168,771,233 cycles # 2.780 GHz + 4,839,683,324 instructions # 2.23 insn per cycle + 0.780703227 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -172,14 +166,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.420966e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.422988e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.422988e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.363934e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.366292e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.366292e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.714158 sec - 1,911,038,749 cycles # 2.664 GHz - 4,293,943,131 instructions # 2.25 insn per cycle - 0.718267339 seconds time elapsed +TOTAL : 0.720076 sec + 1,899,953,237 cycles # 2.625 GHz + 4,294,516,659 instructions # 2.26 insn per cycle + 0.726363241 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -199,14 +193,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.647126e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.649133e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.649133e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.007976e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.010064e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.010064e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.797274 sec - 1,365,650,123 cycles # 1.706 GHz - 2,161,762,081 instructions # 1.58 insn per cycle - 0.801641364 seconds time elapsed +TOTAL : 0.757250 sec + 1,368,592,495 cycles # 1.799 GHz + 2,162,097,112 instructions # 1.58 insn per cycle + 0.761471357 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index a85d1bcb39..9b1c2a1278 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:38:06 +DATE: 2025-12-07_17:54:59 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.071043e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.090506e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.094612e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.028715e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.049405e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.053377e+04 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 -TOTAL : 0.757789 sec - 2,958,910,358 cycles # 2.847 GHz - 4,794,775,632 instructions # 1.62 insn per cycle - 1.096595085 seconds time elapsed +TOTAL : 0.745994 sec + 2,913,163,919 cycles # 2.868 GHz + 4,760,152,513 instructions # 1.63 insn per cycle + 1.074762167 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.567606e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569510e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569696e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.597807e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599654e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 1.206702 sec - 4,225,242,901 cycles # 2.841 GHz - 8,156,770,765 instructions # 1.93 insn per cycle - 1.554101217 seconds time elapsed +TOTAL : 1.178415 sec + 4,301,764,246 cycles # 2.932 GHz + 8,130,387,788 instructions # 1.89 insn per cycle + 1.533986281 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.507145e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.507418e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.507418e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.772041e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.772292e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.772292e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.208388 sec - 17,992,278,108 cycles # 2.897 GHz - 53,336,143,963 instructions # 2.96 insn per cycle - 6.212278042 seconds time elapsed +TOTAL : 6.027274 sec + 17,930,434,545 cycles # 2.974 GHz + 53,036,661,880 instructions # 2.96 insn per cycle + 6.031104999 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087558014E-003 -Relative difference = 2.119787038556726e-08 +Avg ME (F77/C++) = 9.8479612087558118E-003 +Relative difference = 2.1197871442470395e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069142e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.069523e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.069523e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.180265e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.180627e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.180627e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.722052 sec - 4,637,939,725 cycles # 2.688 GHz - 13,805,971,610 instructions # 2.98 insn per cycle - 1.726097842 seconds time elapsed +TOTAL : 1.663357 sec + 4,642,171,949 cycles # 2.785 GHz + 13,805,514,119 instructions # 2.97 insn per cycle + 1.667261778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.610751e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612520e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612520e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.738874e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.740458e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.740458e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.800943 sec - 2,170,709,754 cycles # 2.698 GHz - 4,844,490,730 instructions # 2.23 insn per cycle - 0.805141444 seconds time elapsed +TOTAL : 0.785586 sec + 2,180,968,308 cycles # 2.764 GHz + 4,844,391,218 instructions # 2.22 insn per cycle + 0.789623381 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85852) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.606901e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.608951e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.608951e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.830881e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.833220e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.833220e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.696038 sec - 1,884,685,200 cycles # 2.695 GHz - 4,299,634,626 instructions # 2.28 insn per cycle - 0.700035846 seconds time elapsed +TOTAL : 0.676527 sec + 1,882,310,918 cycles # 2.768 GHz + 4,299,640,343 instructions # 2.28 insn per cycle + 0.680439575 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81642) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.489547e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.491608e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.491608e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.796971e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.799278e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.799278e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.816037 sec - 1,366,505,808 cycles # 1.668 GHz - 2,169,050,969 instructions # 1.59 insn per cycle - 0.820326650 seconds time elapsed +TOTAL : 0.779716 sec + 1,367,028,916 cycles # 1.746 GHz + 2,168,966,795 instructions # 1.59 insn per cycle + 0.783747894 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z:79552) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling index 53bb1cfda7..19ef6c0ee9 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:47:09 +DATE: 2025-12-07_18:03:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.616958e+04 1 256 -1.637015e+04 2 256 -1.727451e+04 4 256 -1.703878e+04 8 256 -1.713757e+04 16 256 -1.692549e+04 32 256 -1.662520e+04 64 256 -1.655737e+04 128 256 -1.660158e+04 256 256 +1.627329e+04 1 256 +1.647263e+04 2 256 +1.738689e+04 4 256 +1.701273e+04 8 256 +1.722554e+04 16 256 +1.693708e+04 32 256 +1.667015e+04 64 256 +1.664135e+04 128 256 +1.670901e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -6.521951e+03 1 32 -1.124531e+04 2 32 -1.474858e+04 4 32 -1.618404e+04 8 32 -1.651807e+04 16 32 -1.695250e+04 32 32 -1.681150e+04 64 32 -1.629231e+04 128 32 -1.600637e+04 256 32 -1.595680e+04 512 32 -1.609152e+04 1024 32 -1.606225e+04 2048 32 +6.509973e+03 1 32 +1.078738e+04 2 32 +1.457067e+04 4 32 +1.616595e+04 8 32 +1.660179e+04 16 32 +1.725296e+04 32 32 +1.648741e+04 64 32 +1.625850e+04 128 32 +1.609187e+04 256 32 +1.587389e+04 512 32 +1.589051e+04 1024 32 +1.601358e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.530837e+01 1 256 -7.486415e+01 2 256 -7.494008e+01 4 256 +7.933723e+01 1 256 +7.989921e+01 2 256 +8.031081e+01 4 256 ### CPU: scaling test 32 -7.525282e+01 1 32 -7.477017e+01 2 32 -7.524610e+01 4 32 +8.112513e+01 1 32 +8.132681e+01 2 32 +8.099957e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.548840e+02 1 256 -1.522353e+02 2 256 -1.543201e+02 4 256 +1.565204e+02 1 256 +1.581031e+02 2 256 +1.615367e+02 4 256 ### CPU: scaling test 32 -1.576268e+02 1 32 -1.582873e+02 2 32 -1.506909e+02 4 32 +1.581474e+02 1 32 +1.593892e+02 2 32 +1.577300e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.557154e+02 1 256 -3.547270e+02 2 256 -3.557554e+02 4 256 +3.707440e+02 1 256 +3.707998e+02 2 256 +3.698300e+02 4 256 ### CPU: scaling test 32 -3.614135e+02 1 32 -3.600100e+02 2 32 -3.596141e+02 4 32 +3.712890e+02 1 32 +3.678840e+02 2 32 +3.703469e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.001766e+02 1 256 -4.125953e+02 2 256 -4.090213e+02 4 256 +4.233199e+02 1 256 +4.224402e+02 2 256 +4.237560e+02 4 256 ### CPU: scaling test 32 -4.084924e+02 1 32 -4.056804e+02 2 32 -4.080579e+02 4 32 +4.332241e+02 1 32 +4.244444e+02 2 32 +4.216637e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.519966e+02 1 256 -3.510473e+02 2 256 -3.460383e+02 4 256 +3.715837e+02 1 256 +3.686908e+02 2 256 +3.666350e+02 4 256 ### CPU: scaling test 32 -3.459963e+02 1 32 -3.417875e+02 2 32 -3.469620e+02 4 32 +3.653430e+02 1 32 +3.625616e+02 2 32 +3.441728e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 686f1c46c7..87348923e2 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:33:09 +DATE: 2025-12-07_17:50:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.606719e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.613205e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.614399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.624310e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.630584e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631682e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 0.810711 sec - 3,229,171,179 cycles # 2.859 GHz - 5,715,641,917 instructions # 1.77 insn per cycle - 1.191471752 seconds time elapsed +TOTAL : 0.782942 sec + 3,249,168,586 cycles # 2.933 GHz + 5,773,198,722 instructions # 1.78 insn per cycle + 1.164526016 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.654245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655018e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655075e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.660665e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.661312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.661378e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 1.784420 sec - 6,293,809,246 cycles # 2.879 GHz - 12,593,045,017 instructions # 2.00 insn per cycle - 2.242570146 seconds time elapsed +TOTAL : 1.748930 sec + 6,307,036,551 cycles # 2.956 GHz + 12,670,020,112 instructions # 2.01 insn per cycle + 2.190906711 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.469254e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.469466e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.469466e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.071086 sec - 19,047,832,122 cycles # 2.693 GHz - 53,831,188,921 instructions # 2.83 insn per cycle - 7.075248115 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.999803e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.000031e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.000031e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 6.600290 sec + 18,197,723,482 cycles # 2.756 GHz + 52,170,801,990 instructions # 2.87 insn per cycle + 6.604160737 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595861831675E-003 -Relative difference = 3.457988134687711e-07 +Avg ME (F77/C++) = 9.8722595126688548E-003 +Relative difference = 3.5324536475016105e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.520487e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.520570e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.520570e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.474834 sec - 9,355,185,296 cycles # 2.691 GHz - 25,920,357,243 instructions # 2.77 insn per cycle - 3.478986906 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.619633e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619725e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619725e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 3.277113 sec + 9,304,611,265 cycles # 2.840 GHz + 25,912,492,496 instructions # 2.78 insn per cycle + 3.282377904 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -128,8 +122,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 
32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594844308162E-003 -Relative difference = 3.5610570575237004e-07 +Avg ME (F77/C++) = 9.8722594304054192E-003 +Relative difference = 3.6157814879843527e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.467313e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467816e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467816e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.523962 sec - 3,999,825,927 cycles # 2.619 GHz - 9,105,365,579 instructions # 2.28 insn per cycle - 1.528167166 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.664154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.664643e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664643e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.444115 sec + 4,013,917,183 cycles # 2.773 GHz + 9,093,755,824 instructions # 2.27 insn per cycle + 1.448052934 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -155,8 +149,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.083261e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.083882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.083882e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.295937 sec - 3,509,301,061 cycles # 2.701 GHz - 8,040,567,810 instructions # 2.29 insn per 
cycle - 1.299964950 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.233998e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.234595e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.234595e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.249873 sec + 3,494,279,838 cycles # 2.789 GHz + 8,028,963,188 instructions # 2.30 insn per cycle + 1.253757045 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79768) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -182,8 +176,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.452173e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.452727e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.452727e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.532017 sec - 2,596,809,497 cycles # 1.691 GHz - 4,060,850,927 instructions # 1.56 insn per cycle - 1.536186135 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.664950e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.665489e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.665489e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.443393 sec + 2,590,689,840 cycles # 1.791 GHz + 4,053,288,164 instructions # 1.56 insn per cycle + 1.447520144 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 61) (512z:78957) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -209,8 +203,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) 
========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling index a739246eca..d85d917459 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:03:38 +DATE: 2025-12-07_18:20:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -34,29 +28,29 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.525607e+04 1 256 -1.592603e+04 2 256 -1.694297e+04 4 256 -1.694752e+04 8 256 -1.680152e+04 16 256 -1.667228e+04 32 256 -1.648853e+04 64 256 -1.642335e+04 128 256 +1.548860e+04 1 256 +1.606035e+04 2 256 +1.688370e+04 4 256 +1.669184e+04 8 256 +1.691229e+04 16 256 +1.666360e+04 32 256 +1.642998e+04 64 256 +1.658560e+04 128 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -5.344354e+03 1 32 -9.059524e+03 2 32 -1.316587e+04 4 32 -1.535902e+04 8 32 -1.599627e+04 16 32 -1.690040e+04 32 32 -1.613824e+04 64 32 -1.606066e+04 128 32 -1.607094e+04 256 32 -1.586333e+04 512 32 -1.570749e+04 1024 32 +5.435922e+03 1 32 +9.645585e+03 2 32 +1.346360e+04 4 32 +1.553062e+04 8 32 +1.595057e+04 16 32 +1.681603e+04 32 32 +1.619719e+04 64 32 +1.611712e+04 128 32 +1.602198e+04 256 32 +1.580951e+04 512 32 +1.577712e+04 1024 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
@@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.451618e+01 1 256 -7.447961e+01 2 256 -7.464296e+01 4 256 +8.086273e+01 1 256 +8.116661e+01 2 256 +8.153265e+01 4 256 ### CPU: scaling test 32 -7.454429e+01 1 32 -7.454562e+01 2 32 -7.491906e+01 4 32 +8.134251e+01 1 32 +8.153318e+01 2 32 +8.156860e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.523430e+02 1 256 -1.528849e+02 2 256 -1.545423e+02 4 256 +1.534870e+02 1 256 +1.600587e+02 2 256 +1.572512e+02 4 256 ### CPU: scaling test 32 -1.508465e+02 1 32 -1.522871e+02 2 32 -1.514789e+02 4 32 +1.716957e+02 1 32 +1.596835e+02 2 32 +1.599762e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.569891e+02 1 256 -3.579373e+02 2 256 -3.580811e+02 4 256 +3.703501e+02 1 256 +3.729621e+02 2 256 +3.717527e+02 4 256 ### CPU: scaling test 32 -3.582840e+02 1 32 -3.591263e+02 2 32 -3.590191e+02 4 32 +3.685675e+02 1 32 +3.717963e+02 2 32 +3.699275e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.091335e+02 1 256 -4.101923e+02 2 256 -4.047677e+02 4 256 +4.266806e+02 1 256 +4.238099e+02 2 256 +4.246639e+02 4 256 ### CPU: scaling test 32 -4.052367e+02 1 32 -4.049500e+02 2 32 -4.058871e+02 4 32 +4.257438e+02 1 32 +4.251832e+02 2 32 +4.244526e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.457958e+02 1 256 -3.518110e+02 2 256 -3.523691e+02 4 256 +3.693821e+02 1 256 +3.550443e+02 2 256 +3.712438e+02 4 256 ### CPU: scaling test 32 -3.457462e+02 1 32 -3.517526e+02 2 32 -3.507713e+02 4 32 +3.712753e+02 1 32 +3.700499e+02 2 32 +3.731274e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 2c63694669..e4a0d6daf4 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:34:55 +DATE: 2025-12-07_17:51:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.591312e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.597916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.599015e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.612319e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619577e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.620651e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 0.809629 sec - 3,237,669,928 cycles # 2.864 GHz - 5,681,011,752 instructions # 1.75 insn per cycle - 1.192308721 seconds time elapsed +TOTAL : 0.781423 sec + 3,236,322,839 cycles # 2.937 GHz + 5,698,114,882 instructions # 1.76 insn per cycle + 1.159089047 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.667525e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.668322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.668373e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672311e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.672962e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.673008e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 1.762250 sec - 6,151,588,956 cycles # 2.862 GHz - 12,789,871,898 instructions # 2.08 insn per cycle - 2.206834958 seconds time elapsed +TOTAL : 1.731134 sec + 6,255,729,538 cycles # 2.954 GHz + 12,741,248,602 instructions # 2.04 insn per cycle + 2.174495296 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.441824e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.442030e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.442030e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.097119 sec - 19,021,241,015 cycles # 2.679 GHz - 53,824,218,201 instructions # 2.83 insn per cycle - 7.101056562 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.997348e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.997573e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.997573e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 6.616407 sec + 18,326,128,515 cycles # 2.769 GHz + 52,164,100,831 instructions # 2.85 insn per cycle + 6.620439694 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595861831675E-003 -Relative difference = 3.457988134687711e-07 +Avg ME (F77/C++) = 9.8722595126688548E-003 +Relative difference = 3.5324536475016105e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.520581e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.520672e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.520672e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.473548 sec - 9,360,233,363 cycles # 2.692 GHz - 25,827,022,283 instructions # 2.76 insn per cycle - 3.477681834 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.555852e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.555935e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.555935e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 3.395086 sec + 9,397,771,728 cycles # 2.766 GHz + 25,818,623,325 instructions # 2.75 insn per cycle + 3.399033591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:95883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -128,8 +122,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 
32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594844308162E-003 -Relative difference = 3.5610570575237004e-07 +Avg ME (F77/C++) = 9.8722594304054192E-003 +Relative difference = 3.6157814879843527e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.499910e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.500338e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.500338e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.510429 sec - 4,054,458,858 cycles # 2.678 GHz - 9,070,411,764 instructions # 2.24 insn per cycle - 1.514545882 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.579117e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.579554e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579554e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.477213 sec + 4,052,732,886 cycles # 2.738 GHz + 9,059,448,547 instructions # 2.24 insn per cycle + 1.481033717 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83452) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -155,8 +149,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.057773e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058358e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058358e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.302962 sec - 3,492,520,706 cycles # 2.673 GHz - 8,024,600,361 instructions # 2.30 insn per 
cycle - 1.307117868 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.126094e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.126701e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.126701e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.281866 sec + 3,515,111,954 cycles # 2.736 GHz + 8,013,597,076 instructions # 2.28 insn per cycle + 1.285775622 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79136) (512y: 215) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -182,8 +176,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.494027e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.494558e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494558e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513587 sec - 2,591,602,459 cycles # 1.708 GHz - 4,056,631,617 instructions # 1.57 insn per cycle - 1.517867253 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.576376e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576923e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576923e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.478652 sec + 2,583,870,342 cycles # 1.744 GHz + 4,049,465,037 instructions # 1.57 insn per cycle + 1.482775583 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1776) (512y: 165) (512z:78888) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -209,8 +203,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) 
========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling index f1df17a77c..d8baf39e83 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:44:03 +DATE: 2025-12-07_18:00:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.428635e+06 1 256 -2.986921e+06 2 256 -5.564976e+06 4 256 -1.150400e+07 8 256 -2.254241e+07 16 256 -3.299328e+07 32 256 -3.991678e+07 64 256 -4.342243e+07 128 256 -4.801742e+07 256 256 -5.029240e+07 512 256 -5.134165e+07 1024 256 +1.559891e+06 1 256 +3.162583e+06 2 256 +6.107343e+06 4 256 +1.191571e+07 8 256 +2.189859e+07 16 256 +3.261004e+07 32 256 +3.952829e+07 64 256 +4.356172e+07 128 256 +4.835766e+07 256 256 +5.044422e+07 512 256 +5.113145e+07 1024 256 ### GPU: scaling test 32 -1.949995e+05 1 32 -3.776925e+05 2 32 -7.282783e+05 4 32 -1.483318e+06 8 32 -2.934652e+06 16 32 -4.620001e+06 32 32 -1.110479e+07 64 32 -2.248141e+07 128 32 -3.497298e+07 256 32 -3.843258e+07 512 32 -4.371853e+07 1024 32 -4.702509e+07 2048 32 -4.914143e+07 4096 32 -5.007560e+07 8192 32 +1.803355e+05 1 32 +3.599429e+05 2 32 +7.736289e+05 4 32 +1.514918e+06 8 32 +3.100944e+06 16 32 +6.042724e+06 32 32 +1.235111e+07 64 32 +2.253930e+07 128 32 +3.534401e+07 256 32 +4.065680e+07 512 32 +4.275273e+07 1024 32 +4.738283e+07 2048 32 +4.933248e+07 4096 32 +5.009291e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.018202e+05 1 256 -1.029861e+05 2 256 -1.049904e+05 4 256 +1.010994e+05 1 256 +1.028131e+05 2 256 +1.027232e+05 4 256 ### CPU: scaling test 32 -9.750093e+04 1 32 -9.993083e+04 2 32 -1.029180e+05 4 32 +9.681509e+04 1 32 +9.798968e+04 2 32 +9.790454e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.770505e+05 1 256 -1.765797e+05 2 256 -1.854054e+05 4 256 +1.713177e+05 1 256 +1.809723e+05 2 256 +1.849161e+05 4 256 ### CPU: scaling test 32 -1.484850e+05 1 32 -1.713608e+05 2 32 -1.595040e+05 4 32 +1.653978e+05 1 32 +1.730020e+05 2 32 +1.695712e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.857545e+05 1 256 -3.168191e+05 2 256 -3.177122e+05 4 256 +3.057509e+05 1 256 +3.193732e+05 2 256 +3.178064e+05 4 256 ### CPU: scaling test 32 -2.953038e+05 1 32 -3.077116e+05 2 32 -2.876185e+05 4 32 +3.172212e+05 1 32 +3.212803e+05 2 32 +3.165707e+05 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.080307e+05 1 256 -3.180421e+05 2 256 -3.341884e+05 4 256 +3.322673e+05 1 256 +3.378042e+05 2 256 +3.394207e+05 4 256 ### CPU: scaling test 32 -2.868052e+05 1 32 -3.156394e+05 2 32 -3.097819e+05 4 32 +3.370018e+05 1 32 +3.409370e+05 2 32 +3.070811e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.313974e+05 1 256 -2.307900e+05 2 256 -2.293449e+05 4 256 +2.316493e+05 1 256 +2.298113e+05 2 256 +2.277677e+05 4 256 ### CPU: scaling test 32 -2.313560e+05 1 32 -2.290500e+05 2 32 -2.289947e+05 4 32 +2.319664e+05 1 32 +2.202651e+05 2 32 +2.189449e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index d112a11495..ea46d19fab 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:27:25 +DATE: 2025-12-07_17:44:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.313564e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022320e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.232850e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.412124e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022600e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.215804e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462516 sec - 1,997,687,796 cycles # 2.814 GHz - 2,748,418,377 instructions # 1.38 insn per cycle - 0.769002804 seconds time elapsed +TOTAL : 0.457836 sec + 2,052,448,702 cycles # 2.885 GHz + 2,800,353,381 instructions # 1.36 insn per cycle + 0.769343461 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.849800e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.989232e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.162437e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.900737e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.001137e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.184265e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537675 sec - 2,303,047,279 cycles # 2.838 GHz - 3,173,611,128 instructions # 1.38 insn per cycle - 0.868680787 seconds time elapsed +TOTAL : 0.530570 sec + 2,318,147,647 cycles # 2.907 GHz + 3,216,957,621 instructions # 1.39 insn per cycle + 0.856472399 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.039909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062156e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.073767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.595860 sec - 4,617,130,408 cycles # 2.888 GHz - 13,249,342,927 instructions # 2.87 insn per cycle - 1.599801948 seconds time elapsed +TOTAL : 1.576487 sec + 4,723,613,011 cycles # 2.990 GHz + 13,278,393,134 instructions # 2.81 insn per cycle + 1.580491440 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.827783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.896147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.896147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.883693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953204e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915570 sec - 2,669,358,674 cycles # 2.905 GHz - 7,600,949,147 instructions # 2.85 insn per cycle - 0.919765484 seconds time elapsed +TOTAL : 0.887936 sec + 2,661,601,083 cycles # 2.987 GHz + 7,600,474,615 instructions # 2.86 insn per cycle + 0.891896259 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.046861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.237725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.237725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.114843e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.309303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.309303e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.557374 sec - 1,530,133,486 cycles # 2.729 GHz - 3,193,359,124 instructions # 2.09 insn per cycle - 0.561538714 seconds time elapsed +TOTAL : 0.544643 sec + 1,524,464,910 cycles # 2.783 GHz + 3,193,740,070 instructions # 2.09 insn per cycle + 0.548523217 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.222833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.436298e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.436298e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.301964e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520407e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.527914 sec - 1,448,845,809 cycles # 2.727 GHz - 3,068,216,889 instructions # 2.12 insn per cycle - 0.532005288 seconds time elapsed +TOTAL : 0.514889 sec + 1,448,429,793 cycles # 2.796 GHz + 3,068,203,829 instructions # 2.12 insn per cycle + 0.518611284 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.262309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.366937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.366937e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.388517e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.502894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.502894e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.746275 sec - 1,345,907,467 cycles # 1.795 GHz - 1,981,512,387 instructions # 1.47 insn per cycle - 0.750498916 seconds time elapsed +TOTAL : 0.707107 sec + 1,342,180,343 cycles # 1.889 GHz + 1,981,543,689 instructions # 1.48 insn per cycle + 0.711102060 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 542ec194e9..dbfa452716 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_16:30:42 +DATE: 2025-12-07_18:50:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.356662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.903029e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.903029e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379608e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.869466e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.869466e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.490080 sec - 2,074,202,921 cycles # 2.819 GHz - 2,982,362,559 instructions # 1.44 insn per cycle - 0.792779275 seconds time elapsed +TOTAL : 0.485696 sec + 2,102,018,256 cycles # 2.885 GHz + 3,032,210,046 instructions # 1.44 insn per cycle + 0.786396788 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.203461e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.181328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.181328e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.169732e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.067578e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.067578e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757533 sec - 2,979,284,817 cycles # 2.853 GHz - 4,399,436,734 instructions # 1.48 insn per cycle - 1.101470538 seconds time elapsed +TOTAL : 0.754667 sec + 3,019,614,859 cycles # 2.913 GHz + 4,478,271,976 instructions # 1.48 insn per cycle + 1.095276992 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. @@ -110,14 +104,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.040166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062990e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062990e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.061358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061358e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.601584 sec - 4,649,519,147 cycles # 2.897 GHz - 13,253,744,210 instructions # 2.85 insn per cycle - 1.606011259 seconds time elapsed +TOTAL : 1.601655 sec + 4,753,927,285 cycles # 2.962 GHz + 13,282,813,554 instructions # 2.79 insn per cycle + 1.605782306 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.815648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884893e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884893e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.858336e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927996e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.929220 sec - 2,705,069,112 cycles # 2.900 GHz - 7,649,258,945 instructions # 2.83 insn per cycle - 0.933656370 seconds time elapsed +TOTAL : 0.906829 sec + 2,695,806,913 cycles # 2.962 GHz + 7,647,614,090 instructions # 2.84 insn per cycle + 0.910743852 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.970773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.160922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.160922e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.073408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266702e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.579438 sec - 1,570,726,943 cycles # 2.694 GHz - 3,243,232,441 instructions # 2.06 insn per cycle - 0.583677287 seconds time elapsed +TOTAL : 0.560203 sec + 1,567,740,727 cycles # 2.781 GHz + 3,243,520,812 instructions # 2.07 insn per cycle + 0.564477950 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.172484e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386570e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.250464e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467546e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.544496 sec - 1,490,247,847 cycles # 2.718 GHz - 3,118,276,131 instructions # 2.09 insn per cycle - 0.548976134 seconds time elapsed +TOTAL : 0.530928 sec + 1,481,573,140 cycles # 2.771 GHz + 3,118,251,090 instructions # 2.10 insn per cycle + 0.535071942 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.208001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.313270e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.313270e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.314571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.423650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.423650e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.771513 sec - 1,385,006,024 cycles # 1.787 GHz - 2,018,418,785 instructions # 1.46 insn per cycle - 0.775891856 seconds time elapsed +TOTAL : 0.736752 sec + 1,376,904,155 cycles # 1.860 GHz + 2,020,155,495 instructions # 1.47 insn per cycle + 0.740938765 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index c96c0f2bba..d1a0055d76 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:27:47 +DATE: 2025-12-07_17:45:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.222648e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.903995e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118782e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.399219e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.955646e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146333e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464819 sec - 2,030,821,916 cycles # 2.839 GHz - 2,744,793,219 instructions # 1.35 insn per cycle - 0.772863650 seconds time elapsed +TOTAL : 0.456810 sec + 2,041,094,274 cycles # 2.898 GHz + 2,815,083,550 instructions # 1.38 insn per cycle + 0.761874601 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.790256e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.896792e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.070548e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.871193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.920673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.092031e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.539655 sec - 2,316,213,602 cycles # 2.850 GHz - 3,194,995,847 instructions # 1.38 insn per cycle - 0.870686173 seconds time elapsed +TOTAL : 0.529715 sec + 2,313,659,879 cycles # 2.908 GHz + 3,219,420,757 instructions # 1.39 insn per cycle + 0.855053105 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.047762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.069666e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069666e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.601117 sec - 4,614,781,714 cycles # 2.877 GHz - 13,227,683,016 instructions # 2.87 insn per cycle - 1.605070443 seconds time elapsed +TOTAL : 1.583114 sec + 4,724,926,803 cycles # 2.979 GHz + 13,256,970,325 instructions # 2.81 insn per cycle + 1.586971741 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900484e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900484e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820678e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.888233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.888233e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913405 sec - 2,666,905,925 cycles # 2.909 GHz - 7,595,681,340 instructions # 2.85 insn per cycle - 0.917462386 seconds time elapsed +TOTAL : 0.918673 sec + 2,665,796,439 cycles # 2.892 GHz + 7,596,140,186 instructions # 2.85 insn per cycle + 0.922497858 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3077) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.997059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186796e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.131559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.325326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.325326e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.566232 sec - 1,532,545,982 cycles # 2.690 GHz - 3,190,811,369 instructions # 2.08 insn per cycle - 0.570104783 seconds time elapsed +TOTAL : 0.541853 sec + 1,527,464,925 cycles # 2.802 GHz + 3,190,711,080 instructions # 2.09 insn per cycle + 0.545622743 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3005) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.138120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.345703e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.345703e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.266282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.483523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483523e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.542027 sec - 1,447,882,232 cycles # 2.655 GHz - 3,062,649,899 instructions # 2.12 insn per cycle - 0.545967207 seconds time elapsed +TOTAL : 0.520838 sec + 1,445,897,016 cycles # 2.759 GHz + 3,063,177,557 instructions # 2.12 insn per cycle + 0.524701989 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2804) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.226133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.328099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.279629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.388632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.388632e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.757778 sec - 1,343,211,600 cycles # 1.765 GHz - 1,978,672,810 instructions # 1.47 insn per cycle - 0.761787399 seconds time elapsed +TOTAL : 0.740548 sec + 1,345,418,119 cycles # 1.809 GHz + 1,978,529,476 instructions # 1.47 insn per cycle + 0.744490522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1416) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling index 8a82307bae..fde967cddd 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:44:45 +DATE: 2025-12-07_18:01:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.527045e+06 1 256 -3.131556e+06 2 256 -6.093388e+06 4 256 -1.251780e+07 8 256 -2.244630e+07 16 256 -4.178995e+07 32 256 -6.592442e+07 64 256 -7.658956e+07 128 256 -8.216021e+07 256 256 -8.838611e+07 512 256 -9.244041e+07 1024 256 +1.545978e+06 1 256 +3.085062e+06 2 256 +5.882995e+06 4 256 +1.170279e+07 8 256 +2.406764e+07 16 256 +4.501346e+07 32 256 +6.393631e+07 64 256 +7.829738e+07 128 256 +8.244580e+07 256 256 +8.834816e+07 512 256 +9.289275e+07 1024 256 ### GPU: scaling test 32 -1.864346e+05 1 32 -3.981461e+05 2 32 -7.916041e+05 4 32 -1.446352e+06 8 32 -2.861310e+06 16 32 -6.255536e+06 32 32 -1.192410e+07 64 32 -2.215132e+07 128 32 -4.236701e+07 256 32 -6.877647e+07 512 32 -7.973525e+07 1024 32 -8.551740e+07 2048 32 -9.532558e+07 4096 32 -9.914765e+07 8192 32 +1.860108e+05 1 32 +4.013949e+05 2 32 +7.858740e+05 4 32 +1.409762e+06 8 32 +3.145427e+06 16 32 +6.225492e+06 32 32 +1.060091e+07 64 32 +2.354470e+07 128 32 +4.403260e+07 256 32 +6.568207e+07 512 32 +7.963023e+07 1024 32 +8.619041e+07 2048 32 +9.573204e+07 4096 32 +9.952312e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.054964e+05 1 256 -1.086764e+05 2 256 -1.085879e+05 4 256 +1.082948e+05 1 256 +1.072635e+05 2 256 +1.086301e+05 4 256 ### CPU: scaling test 32 -9.631447e+04 1 32 -1.042281e+05 2 32 -1.016890e+05 4 32 +9.166559e+04 1 32 +9.551812e+04 2 32 +1.061641e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.679848e+05 1 256 -2.830096e+05 2 256 -2.920388e+05 4 256 +2.752164e+05 1 256 +2.763836e+05 2 256 +2.910255e+05 4 256 ### CPU: scaling test 32 -2.003030e+05 1 32 -2.733186e+05 2 32 -2.733314e+05 4 32 +2.591366e+05 1 32 +2.785140e+05 2 32 +2.698293e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.015207e+05 1 256 -5.639568e+05 2 256 -5.644473e+05 4 256 +6.010744e+05 1 256 +6.046678e+05 2 256 +5.536733e+05 4 256 ### CPU: scaling test 32 -5.530113e+05 1 32 -5.540310e+05 2 32 -6.104453e+05 4 32 +4.530717e+05 1 32 +5.269789e+05 2 32 +6.080183e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.318601e+05 1 256 -5.672087e+05 2 256 -5.418454e+05 4 256 +6.323736e+05 1 256 +6.414545e+05 2 256 +6.388684e+05 4 256 ### CPU: scaling test 32 -4.569666e+05 1 32 -5.422212e+05 2 32 -5.271481e+05 4 32 +6.280051e+05 1 32 +6.379077e+05 2 32 +6.397889e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.266468e+05 1 256 -4.319869e+05 2 256 -4.643166e+05 4 256 +4.625071e+05 1 256 +4.641070e+05 2 256 +4.587238e+05 4 256 ### CPU: scaling test 32 -4.562174e+05 1 32 -4.628927e+05 2 32 -4.441638e+05 4 32 +4.426125e+05 1 32 +4.571788e+05 2 32 +4.530316e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 3c2f832038..a6e113641f 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:28:49 +DATE: 2025-12-07_17:46:02 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.775185e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.659813e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.119856e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.092434e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.795067e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.190203e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.460990 sec - 2,032,870,493 cycles # 2.841 GHz - 2,757,410,394 instructions # 1.36 insn per cycle - 0.774218584 seconds time elapsed +TOTAL : 0.449920 sec + 2,018,569,542 cycles # 2.906 GHz + 2,801,263,290 instructions # 1.39 insn per cycle + 0.751830557 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.197057e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.828077e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.174418e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.393078e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.955674e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.250572e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.492525 sec - 2,151,242,968 cycles # 2.846 GHz - 2,972,332,872 instructions # 1.38 insn per cycle - 0.812892837 seconds time elapsed +TOTAL : 0.486272 sec + 2,179,837,924 cycles # 2.911 GHz + 3,028,972,128 instructions # 1.39 insn per cycle + 0.807531221 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. @@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.135013e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.523041 sec - 4,438,181,728 cycles # 2.908 GHz - 12,997,899,281 instructions # 2.93 insn per cycle - 1.526979824 seconds time elapsed +TOTAL : 1.493243 sec + 4,454,581,022 cycles # 2.977 GHz + 13,073,832,839 instructions # 2.93 insn per cycle + 1.497047141 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.813324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.986491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.986491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.912945e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.093985e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.599748 sec - 1,741,244,369 cycles # 2.889 GHz - 4,565,155,972 instructions # 2.62 insn per cycle - 0.603721432 seconds time elapsed +TOTAL : 0.579270 sec + 1,737,738,306 cycles # 2.985 GHz + 4,565,063,526 instructions # 2.63 insn per cycle + 
0.582916642 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.470584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.128186e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.128186e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.494533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.146588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.146588e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.317328 sec - 874,197,910 cycles # 2.725 GHz - 1,937,671,895 instructions # 2.22 insn per cycle - 0.321309948 seconds time elapsed +TOTAL : 0.315833 sec + 873,968,612 cycles # 2.740 GHz + 1,937,610,645 instructions # 2.22 insn per cycle + 0.319472162 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.732936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.453145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.453145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.696388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.416718e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.416718e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.303630 sec - 837,570,844 cycles # 2.728 GHz - 1,865,428,267 instructions # 2.23 insn per cycle - 0.307759201 seconds time elapsed +TOTAL : 0.305796 sec + 838,792,643 cycles # 2.715 GHz + 1,865,446,250 instructions # 2.22 insn per cycle + 0.309563058 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.363450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.779212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.779212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.228649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.630678e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.630678e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.396164 sec - 743,365,153 cycles # 1.861 GHz - 1,320,595,546 instructions # 1.78 insn per cycle - 0.400174159 seconds 
time elapsed +TOTAL : 0.408073 sec + 740,699,668 cycles # 1.801 GHz + 1,320,632,331 instructions # 1.78 insn per cycle + 0.411972779 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 3158a41f16..dc5ef34cfb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_16:31:01 +DATE: 2025-12-07_18:50:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.164266e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.164377e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.164377e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.326832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.232907e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.232907e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.466915 sec - 2,002,533,494 cycles # 2.818 GHz - 2,846,516,929 instructions # 1.42 insn per cycle - 0.767921314 seconds time elapsed +TOTAL : 0.464429 sec + 2,058,653,378 cycles # 2.890 GHz + 2,915,995,644 instructions # 1.42 insn per cycle + 0.770600434 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.935448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962699e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.962699e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.846605e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837940e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837940e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638881 sec - 2,551,134,973 cycles # 2.829 GHz - 3,814,025,702 instructions # 1.50 insn per cycle - 0.960291968 seconds time elapsed +TOTAL : 0.636023 sec + 2,563,647,443 cycles # 2.847 GHz + 3,801,547,024 instructions # 1.48 insn per cycle + 0.957625401 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -110,14 +104,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.072670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097133e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.099430e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.123806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.123806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.549724 sec - 4,455,261,943 cycles # 2.869 GHz - 13,001,491,970 instructions # 2.92 insn per cycle - 1.553804785 seconds time elapsed +TOTAL : 1.511344 sec + 4,472,918,250 cycles # 2.954 GHz + 13,077,475,043 instructions # 2.92 insn per cycle + 1.515170451 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.775020e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.950077e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950077e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.866274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.042510e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.042510e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.612678 sec - 1,763,964,947 cycles # 2.863 GHz - 4,612,364,671 instructions # 2.61 insn per cycle - 0.616741606 seconds time elapsed +TOTAL : 0.592955 sec + 1,760,206,978 cycles # 2.953 GHz + 4,612,711,813 instructions # 2.62 insn per cycle + 0.596712417 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.406265e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.059656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.059656e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.560552e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.225940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.225940e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.325484 sec - 894,227,621 cycles # 2.718 GHz - 1,973,650,274 instructions # 2.21 insn per cycle - 0.329612707 seconds time elapsed +TOTAL : 0.316525 sec + 892,676,779 cycles # 2.791 GHz + 1,973,951,566 instructions # 2.21 insn per cycle + 0.320406859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.495052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.198837e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.198837e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.767680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.495706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.495706e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.321201 sec - 866,167,930 cycles # 2.668 GHz - 1,901,550,421 instructions # 2.20 insn per cycle - 0.325340653 seconds time elapsed +TOTAL : 0.305863 sec + 856,182,367 cycles # 2.770 GHz + 1,901,281,000 instructions # 2.22 insn per cycle + 0.309760269 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.189669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.585230e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.585230e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.246469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.646011e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.646011e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.417280 sec - 768,093,760 cycles # 1.825 GHz - 1,361,032,349 instructions # 1.77 insn per cycle - 0.423250195 seconds time elapsed +TOTAL : 0.411142 sec + 765,357,320 cycles # 1.847 GHz + 1,361,322,198 instructions # 1.78 insn per cycle + 0.415115548 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8874a06c98..28379a6af1 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:29:09 +DATE: 2025-12-07_17:46:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.726166e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.668422e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.110300e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.203852e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.893691e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.285942e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.456732 sec - 1,986,727,615 cycles # 2.822 GHz - 2,734,105,162 instructions # 1.38 insn per cycle - 0.761604044 seconds time elapsed +TOTAL : 0.449462 sec + 2,011,080,689 cycles # 2.900 GHz + 2,776,287,193 instructions # 1.38 insn per cycle + 0.750462800 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 163 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.139451e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.748092e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.065888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.395855e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.945398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.242840e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.491750 sec - 2,144,083,987 cycles # 2.843 GHz - 2,965,934,309 instructions # 1.38 insn per cycle - 0.811495819 seconds time elapsed +TOTAL : 0.481684 sec + 2,149,897,064 cycles # 2.906 GHz + 2,993,575,904 instructions # 1.39 insn per cycle + 0.796943556 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. @@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088510e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113295e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.138299e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138299e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.523573 sec - 4,436,604,782 cycles # 2.906 GHz - 12,976,159,794 instructions # 2.92 insn per cycle - 1.527521775 seconds time elapsed +TOTAL : 1.489510 sec + 4,453,662,934 cycles # 2.984 GHz + 13,052,116,511 instructions # 2.93 insn per cycle + 1.493339165 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.835028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.015163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.915195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.093884e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.596717 sec - 1,741,466,538 cycles # 2.902 GHz - 4,559,733,587 instructions # 2.62 insn per cycle - 0.600733453 seconds time elapsed +TOTAL : 0.578808 sec + 1,737,931,197 cycles # 2.987 GHz + 4,559,701,975 instructions # 2.62 insn per cycle + 
0.582517061 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3592) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.380055e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.028758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.028758e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.642605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.310158e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.310158e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.322659 sec - 877,270,879 cycles # 2.691 GHz - 1,934,809,792 instructions # 2.21 insn per cycle - 0.326541378 seconds time elapsed +TOTAL : 0.307708 sec + 873,831,935 cycles # 2.812 GHz + 1,934,891,112 instructions # 2.21 insn per cycle + 0.311323940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3579) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.601915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.305503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.305503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.903888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.658554e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.658554e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.310801 sec - 841,602,182 cycles # 2.678 GHz - 1,861,524,675 instructions # 2.21 insn per cycle - 0.314890210 seconds time elapsed +TOTAL : 0.294373 sec + 837,337,003 cycles # 2.815 GHz + 1,861,455,182 instructions # 2.22 insn per cycle + 0.298105174 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3449) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.229370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.636992e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.636992e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.555326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.005879e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.005879e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.407631 sec - 742,675,842 cycles # 1.807 GHz - 1,318,218,015 instructions # 1.77 insn per cycle - 0.411673396 seconds 
time elapsed +TOTAL : 0.379034 sec + 741,526,458 cycles # 1.941 GHz + 1,318,196,991 instructions # 1.78 insn per cycle + 0.382861566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1996) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling index 86c9b7a546..38ffe4090a 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:44:24 +DATE: 2025-12-07_18:01:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.435943e+06 1 256 -3.007907e+06 2 256 -5.634857e+06 4 256 -1.139868e+07 8 256 -2.191875e+07 16 256 -3.261770e+07 32 256 -3.913775e+07 64 256 -4.321439e+07 128 256 -4.782407e+07 256 256 -5.013042e+07 512 256 -5.117203e+07 1024 256 +1.555304e+06 1 256 +2.988978e+06 2 256 +5.894645e+06 4 256 +1.105575e+07 8 256 +2.217962e+07 16 256 +3.314667e+07 32 256 +3.835673e+07 64 256 +4.412580e+07 128 256 +4.800304e+07 256 256 +5.051797e+07 512 256 +5.121281e+07 1024 256 ### GPU: scaling test 32 -1.833223e+05 1 32 -3.625426e+05 2 32 -7.314829e+05 4 32 -1.459646e+06 8 32 -2.859760e+06 16 32 -5.667384e+06 32 32 -1.106459e+07 64 32 -2.218503e+07 128 32 -3.531887e+07 256 32 -3.896073e+07 512 32 -4.341558e+07 1024 32 -4.714542e+07 2048 32 -4.934308e+07 4096 32 -4.999316e+07 8192 32 +2.084718e+05 1 32 +3.953277e+05 2 32 +7.824917e+05 4 32 +1.319574e+06 8 32 +3.119402e+06 16 32 +5.616160e+06 32 32 +9.585726e+06 64 32 +2.295167e+07 128 32 +3.529300e+07 256 32 +3.979964e+07 512 32 +4.329009e+07 1024 32 +4.749829e+07 2048 32 +4.901632e+07 4096 32 +5.017713e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.008880e+05 1 256 -1.037575e+05 2 256 -1.026899e+05 4 256 +1.029806e+05 1 256 +1.019252e+05 2 256 +1.029857e+05 4 256 ### CPU: scaling test 32 -8.543860e+04 1 32 -9.559401e+04 2 32 -9.690869e+04 4 32 +9.553095e+04 1 32 +8.954581e+04 2 32 +9.956209e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.755069e+05 1 256 -1.824668e+05 2 256 -1.862361e+05 4 256 +1.798688e+05 1 256 +1.829001e+05 2 256 +1.857394e+05 4 256 ### CPU: scaling test 32 -1.737091e+05 1 32 -1.676543e+05 2 32 -1.681730e+05 4 32 +1.782075e+05 1 32 +1.661181e+05 2 32 +1.755565e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.270964e+05 1 256 -3.057259e+05 2 256 -3.141285e+05 4 256 +3.283129e+05 1 256 +3.287324e+05 2 256 +3.320010e+05 4 256 ### CPU: scaling test 32 -2.994544e+05 1 32 -3.090295e+05 2 32 -3.346475e+05 4 32 +3.301556e+05 1 32 +3.168631e+05 2 32 +3.282843e+05 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.254054e+05 1 256 -3.252183e+05 2 256 -3.259569e+05 4 256 +3.497994e+05 1 256 +3.477196e+05 2 256 +3.530979e+05 4 256 ### CPU: scaling test 32 -3.498874e+05 1 32 -3.542076e+05 2 32 -3.198481e+05 4 32 +3.505697e+05 1 32 +3.531853e+05 2 32 +3.537065e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.243613e+05 1 256 -2.351291e+05 2 256 -2.345114e+05 4 256 +2.351813e+05 1 256 +2.349751e+05 2 256 +2.343375e+05 4 256 ### CPU: scaling test 32 -2.301860e+05 1 32 -2.329857e+05 2 32 -2.104986e+05 4 32 +2.159958e+05 1 32 +2.350349e+05 2 32 +2.284250e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d3f2e68af7..8fef45174d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:28:08 +DATE: 2025-12-07_17:45:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235119e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971049e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180643e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.424814e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.028061e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.216676e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464283 sec - 2,023,320,904 cycles # 2.839 GHz - 2,773,493,223 instructions # 1.37 insn per cycle - 0.771475737 seconds time elapsed +TOTAL : 0.458652 sec + 2,077,152,698 cycles # 2.905 GHz + 2,843,293,582 instructions # 1.37 insn per cycle + 0.773710998 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.827739e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.997089e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.176442e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.907349e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.001756e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.178480e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537726 sec - 2,282,885,717 cycles # 2.817 GHz - 3,160,756,797 instructions # 1.38 insn per cycle - 0.868903156 seconds time elapsed +TOTAL : 0.527832 sec + 2,314,831,509 cycles # 2.912 GHz + 3,228,871,474 instructions # 1.39 insn per cycle + 0.852590730 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.042873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.065099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.065099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.060844e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083016e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.591072 sec - 4,638,115,400 cycles # 2.909 GHz - 13,236,410,026 instructions # 2.85 insn per cycle - 1.595277597 seconds time elapsed +TOTAL : 1.563285 sec + 4,690,049,419 cycles # 2.995 GHz + 13,333,451,257 instructions # 2.84 insn per cycle + 1.567255534 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247483100282887 +Relative difference = 4.842759750343022e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.902450e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.902450e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.891021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.961493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961493e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913352 sec - 2,653,863,508 cycles # 2.895 GHz - 7,455,424,096 instructions # 2.81 insn per cycle - 0.917427770 seconds time elapsed +TOTAL : 0.884699 sec + 2,649,946,077 cycles # 2.984 GHz + 7,451,412,729 instructions # 2.81 insn per cycle + 0.888654390 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe 
--common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482733329694 -Relative difference = 5.100316128927506e-07 +Avg ME (F77/C++) = 0.14247482642920581 +Relative difference = 5.163772298069564e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.117188e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.267035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.480357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.480357e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545094 sec - 1,478,675,993 cycles # 2.696 GHz - 3,118,440,007 instructions # 2.11 insn per cycle - 0.549086981 seconds time elapsed +TOTAL : 0.520453 sec + 1,473,032,375 cycles # 2.813 GHz + 3,114,248,477 instructions # 2.11 insn per cycle + 0.524341905 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.250725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.471460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471460e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.421284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.654965e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654965e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.523896 sec - 1,401,490,342 cycles # 2.658 GHz - 2,993,266,123 instructions # 2.14 insn per cycle - 0.527885129 seconds time elapsed +TOTAL : 0.498153 sec + 
1,400,092,122 cycles # 2.793 GHz + 2,988,376,365 instructions # 2.13 insn per cycle + 0.501973762 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2873) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.231374e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.335386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.335386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.403237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.517629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.517629e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.756616 sec - 1,324,382,086 cycles # 1.743 GHz - 1,938,261,257 instructions # 1.46 insn per cycle - 0.760681799 seconds time elapsed +TOTAL : 0.702567 sec + 1,322,886,892 cycles # 1.875 GHz + 1,934,670,746 instructions # 1.46 insn per cycle + 0.706598785 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 7ec5b5c818..3d8a4d8f27 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 
@@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:28:30 +DATE: 2025-12-07_17:45:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.256105e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967576e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.174354e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.388535e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967550e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.151628e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.463340 sec - 2,028,215,818 cycles # 2.846 GHz - 2,776,961,604 instructions # 1.37 insn per cycle - 0.769909609 seconds time elapsed +TOTAL : 0.453084 sec + 2,033,177,679 cycles # 2.913 GHz + 2,813,101,165 instructions # 1.38 insn per cycle + 0.755271859 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.777604e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.905810e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.079424e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.873869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.908960e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.079468e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537813 sec - 2,311,546,315 cycles # 2.847 GHz - 3,204,384,721 instructions # 1.39 insn per cycle - 0.869430768 seconds time elapsed +TOTAL : 0.532479 sec + 2,258,806,562 cycles # 2.820 GHz + 3,177,677,640 instructions # 1.41 insn per cycle + 0.858731735 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. @@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.027944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049964e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080699e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.080699e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.613580 sec - 4,641,772,345 cycles # 2.871 GHz - 13,214,748,096 instructions # 2.85 insn per cycle - 1.617579626 seconds time elapsed +TOTAL : 1.566883 sec + 4,697,798,294 cycles # 2.993 GHz + 13,311,633,890 instructions # 2.83 insn per cycle + 1.570820489 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247483100282887 +Relative difference = 4.842759750343022e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.824575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893158e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893158e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893758e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964394e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.916995 sec - 2,647,231,235 cycles # 2.877 GHz - 7,451,993,603 instructions # 2.82 insn per cycle - 0.920907127 seconds time elapsed +TOTAL : 0.883475 sec + 2,648,834,413 cycles # 2.987 GHz + 7,448,102,745 instructions # 2.81 insn per cycle + 0.887387358 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3057) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482733329694 -Relative difference = 5.100316128927506e-07 +Avg ME (F77/C++) = 0.14247482642920581 +Relative difference = 5.163772298069564e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.116778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320418e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.450021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.450021e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545336 sec - 1,472,587,180 cycles # 2.683 GHz - 3,116,400,718 instructions # 2.12 insn per cycle - 0.549340783 seconds time elapsed +TOTAL : 0.524496 sec + 1,473,213,661 cycles # 2.791 GHz + 3,112,152,755 instructions # 2.11 insn per cycle + 0.528287959 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.223699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.443094e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443094e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.455603e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.690202e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.690202e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.528265 sec - 1,399,996,992 cycles # 2.634 GHz - 2,990,999,773 instructions # 2.14 insn per cycle - 0.532237029 seconds time elapsed +TOTAL : 0.492901 sec + 1,395,948,598 cycles # 2.813 GHz + 2,986,433,499 instructions # 2.14 insn per cycle + 0.496779444 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302312e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.410857e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.410857e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.394394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.508624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.508624e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.733431 sec - 1,324,620,583 cycles # 1.798 GHz - 1,936,852,170 instructions # 1.46 insn per cycle - 0.737506511 seconds time elapsed +TOTAL : 0.705106 sec + 1,320,302,465 cycles # 1.864 GHz + 
1,933,114,904 instructions # 1.46 insn per cycle + 0.708944911 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1344) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 14462fa0eb..b9c84d5fc0 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:04:42 +DATE: 2025-12-07_19:50:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.654485e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.404459e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690060e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499232e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.344503e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.664319e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.541401 sec - 2,305,332,177 cycles # 2.847 GHz - 3,197,913,952 instructions # 1.39 insn per cycle - 0.868100814 seconds time elapsed +TOTAL : 0.538359 sec + 2,372,934,941 cycles # 2.919 GHz + 3,300,641,062 instructions # 1.39 insn per cycle + 0.871969046 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.571130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.606300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.606300e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.632915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.632915e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.786947 sec - 19,519,870,393 cycles # 2.875 GHz - 52,258,888,975 instructions # 2.68 insn per cycle - 6.792671431 seconds time elapsed +TOTAL : 6.670067 sec + 19,790,382,228 cycles # 2.965 GHz + 52,408,555,946 instructions # 2.65 insn per cycle + 6.675493531 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.857187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.984563e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.984563e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.920243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.049982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.049982e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.780938 sec - 10,994,068,173 cycles # 2.904 GHz - 30,917,710,259 instructions # 2.81 insn per cycle - 3.786765562 seconds time elapsed +TOTAL : 3.700315 sec + 11,051,003,213 
cycles # 2.983 GHz + 30,918,131,093 instructions # 2.80 insn per cycle + 3.705710406 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.468427e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.776131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.776131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.711210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.049728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.049728e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.458667 sec - 6,708,728,258 cycles # 2.723 GHz - 13,712,517,378 instructions # 2.04 insn per cycle - 2.464482201 seconds time elapsed +TOTAL : 2.334546 sec + 6,493,801,307 cycles # 2.777 GHz + 13,712,299,639 instructions # 2.11 insn per cycle + 2.339859877 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2936) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.847459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.209715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.209715e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.006810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.382049e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.382049e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.275732 sec - 6,180,724,079 cycles # 2.710 GHz - 13,193,237,105 instructions # 2.13 insn per cycle - 2.281442783 seconds time elapsed +TOTAL : 2.203069 sec + 6,172,878,992 cycles # 2.796 GHz + 13,191,345,481 instructions # 2.14 insn per cycle + 2.208468868 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2714) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.203485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.355713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.355713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.425328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596961e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596961e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.384877 sec - 5,997,535,040 
cycles # 1.769 GHz - 8,705,216,175 instructions # 1.45 insn per cycle - 3.390523516 seconds time elapsed +TOTAL : 3.170305 sec + 6,028,915,347 cycles # 1.899 GHz + 8,706,535,911 instructions # 1.44 insn per cycle + 3.175667925 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1546) (512y: 106) (512z: 1954) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index c1b909362e..601bac6c1b 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:05:16 +DATE: 2025-12-07_19:51:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.602305e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.299861e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.572992e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460521e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.251331e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.556502e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.543522 sec - 2,289,271,142 cycles # 2.817 GHz - 3,205,208,831 instructions # 1.40 insn per cycle - 0.870293269 seconds time elapsed +TOTAL : 0.537779 sec + 2,384,976,935 cycles # 2.919 GHz + 3,329,989,997 instructions # 1.40 insn per cycle + 0.873968223 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.653039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.691951e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.691951e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.681755e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.721234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.721234e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.455303 sec - 18,685,885,377 cycles # 2.893 GHz - 50,237,697,539 instructions # 2.69 insn per cycle - 6.460495783 seconds time elapsed +TOTAL : 6.343609 sec + 18,990,099,332 cycles # 2.992 GHz + 50,387,827,115 instructions # 2.65 insn per cycle + 6.349149778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.954178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.091326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.091326e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.108011e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.255309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.255309e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.661921 sec - 10,461,474,208 cycles # 2.853 GHz - 29,320,644,078 instructions # 2.80 insn per cycle - 3.667913174 seconds time elapsed +TOTAL : 3.481656 sec + 10,453,162,999 
cycles # 2.999 GHz + 29,321,190,758 instructions # 2.81 insn per cycle + 3.486975314 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.223646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.500682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.500682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.397481e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.685784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.685784e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.594203 sec - 6,988,437,642 cycles # 2.689 GHz - 15,195,785,073 instructions # 2.17 insn per cycle - 2.599980482 seconds time elapsed +TOTAL : 2.494176 sec + 6,986,273,206 cycles # 2.797 GHz + 15,196,051,795 instructions # 2.18 insn per cycle + 2.499495433 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3011) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.417064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.714981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.714981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.470489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.768170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.768170e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.485778 sec - 6,715,707,590 cycles # 2.696 GHz - 14,680,064,315 instructions # 2.19 insn per cycle - 2.491527768 seconds time elapsed +TOTAL : 2.454903 sec + 6,737,285,136 cycles # 2.739 GHz + 14,678,401,759 instructions # 2.18 insn per cycle + 2.460369908 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2612) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.163644e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312325e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.312325e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.505986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505986e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.425924 sec - 6,178,650,952 
cycles # 1.801 GHz - 10,506,622,006 instructions # 1.70 insn per cycle - 3.431763355 seconds time elapsed +TOTAL : 3.241663 sec + 6,172,260,482 cycles # 1.902 GHz + 10,505,911,520 instructions # 1.70 insn per cycle + 3.246942287 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1317) (512y: 216) (512z: 2136) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 32d858512c..842b57c1b6 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:06:56 +DATE: 2025-12-07_19:52:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.746430e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.525187e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.618301e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.469309e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.519758e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.614209e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.494982 sec - 2,135,489,785 cycles # 2.833 GHz - 2,986,554,714 instructions # 1.40 insn per cycle - 0.812364995 seconds time elapsed +TOTAL : 0.494208 sec + 2,177,050,619 cycles # 2.903 GHz + 3,032,774,451 instructions # 1.39 insn per cycle + 0.808193980 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 99 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.639930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.679722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.679722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.692382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.733367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733367e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.483754 sec - 18,765,516,643 cycles # 2.893 GHz - 51,374,423,413 instructions # 2.74 insn per cycle - 6.489228485 seconds time elapsed +TOTAL : 6.281822 sec + 18,778,600,913 cycles # 2.988 GHz + 51,374,119,524 instructions # 2.74 insn per cycle + 6.286947062 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.904149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155838e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.061733e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.329738e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.329738e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.775203 sec - 8,009,571,813 cycles # 2.881 GHz - 19,418,906,078 instructions # 2.42 insn per cycle - 2.780526828 seconds time elapsed +TOTAL : 2.667494 sec + 7,988,259,502 cycles 
# 2.990 GHz + 19,416,338,334 instructions # 2.43 insn per cycle + 2.672653519 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.670886e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.626596e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.626596e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.979751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.994085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.994085e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.456000 sec - 3,972,178,441 cycles # 2.719 GHz - 8,869,239,722 instructions # 2.23 insn per cycle - 1.461741307 seconds time elapsed +TOTAL : 1.399608 sec + 3,971,951,433 cycles # 2.829 GHz + 8,869,165,930 instructions # 2.23 insn per cycle + 1.404847438 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3709) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.928240e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.948874e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.948874e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.170772e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.225364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.225364e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.411952 sec - 3,818,419,324 cycles # 2.695 GHz - 8,547,519,956 instructions # 2.24 insn per cycle - 1.417398798 seconds time elapsed +TOTAL : 1.369236 sec + 3,895,741,782 cycles # 2.836 GHz + 8,547,304,446 instructions # 2.19 insn per cycle + 1.374277178 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3594) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.574912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.065441e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.065441e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.936006e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.464227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.464227e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.971243 sec - 3,626,432,325 cycles # 1.835 
GHz - 6,319,513,510 instructions # 1.74 insn per cycle - 1.976911767 seconds time elapsed +TOTAL : 1.853308 sec + 3,628,616,116 cycles # 1.954 GHz + 6,319,812,802 instructions # 1.74 insn per cycle + 1.858514145 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2377) (512y: 0) (512z: 2299) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 218c8378c2..bd944fab38 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:07:25 +DATE: 2025-12-07_19:53:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.779658e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.535884e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628235e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.498291e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535174e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631464e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493747 sec - 2,136,570,540 cycles # 2.832 GHz - 2,955,252,814 instructions # 1.38 insn per cycle - 0.811353108 seconds time elapsed +TOTAL : 0.488348 sec + 2,195,631,001 cycles # 2.917 GHz + 3,067,116,970 instructions # 1.40 insn per cycle + 0.810654338 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 100 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.693969e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.736524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.736524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.749989e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795302e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795302e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.279316 sec - 18,165,491,134 cycles # 2.891 GHz - 49,676,906,698 instructions # 2.73 insn per cycle - 6.284692119 seconds time elapsed +TOTAL : 6.079419 sec + 18,187,089,880 cycles # 2.990 GHz + 49,676,523,595 instructions # 2.73 insn per cycle + 6.084557079 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.443862e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.778187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.778187e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.544163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.884573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.884573e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.449024 sec - 7,084,328,481 cycles # 2.887 GHz - 18,582,770,693 instructions # 2.62 insn per cycle - 2.454447463 seconds time elapsed +TOTAL : 2.394879 sec + 7,090,687,022 cycles 
# 2.956 GHz + 18,582,800,903 instructions # 2.62 insn per cycle + 2.400016880 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.216367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.641236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.641236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.450055e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.909919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.909919e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.098866 sec - 5,652,855,011 cycles # 2.688 GHz - 10,909,770,006 instructions # 1.93 insn per cycle - 2.104181652 seconds time elapsed +TOTAL : 2.010341 sec + 5,665,447,412 cycles # 2.812 GHz + 10,909,503,614 instructions # 1.93 insn per cycle + 2.015535049 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4283) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.314509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.753400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.753400e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.553437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.023565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.023565e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.062043 sec - 5,590,274,103 cycles # 2.706 GHz - 10,617,976,090 instructions # 1.90 insn per cycle - 2.067292425 seconds time elapsed +TOTAL : 1.972980 sec + 5,578,187,369 cycles # 2.821 GHz + 10,615,988,400 instructions # 1.90 insn per cycle + 1.978660825 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4142) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.151626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.412256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.323304e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.597573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.597573e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.614832 sec - 4,741,117,769 cycles # 
1.810 GHz - 8,743,372,129 instructions # 1.84 insn per cycle - 2.620465706 seconds time elapsed +TOTAL : 2.511881 sec + 4,745,808,007 cycles # 1.886 GHz + 8,743,032,391 instructions # 1.84 insn per cycle + 2.517128259 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2850) (512y: 0) (512z: 2889) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index f4ff8c446a..d46708efda 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:05:47 +DATE: 2025-12-07_19:51:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.626534e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.403274e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.688448e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.491084e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.355448e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.671383e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.543452 sec - 2,301,166,740 cycles # 2.836 GHz - 3,210,334,164 instructions # 1.40 insn per cycle - 0.870784678 seconds time elapsed +TOTAL : 0.538159 sec + 2,342,164,709 cycles # 2.910 GHz + 3,287,695,503 instructions # 1.40 insn per cycle + 0.863418052 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.489645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.521138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.521138e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.532573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.564701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.564701e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 7.151635 sec - 20,539,261,330 cycles # 2.870 GHz - 52,312,072,955 instructions # 2.55 insn per cycle - 7.157317940 seconds time elapsed +TOTAL : 6.951068 sec + 20,590,496,337 cycles # 2.960 GHz + 52,260,907,600 instructions # 2.54 insn per cycle + 6.956528947 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711782756741 -Relative difference = 1.9050183377028104e-07 +Avg ME (F77/C++) = 4.3134711542529578 +Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.635024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.743558e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743558e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.662308e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.772758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.772758e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 4.091108 sec - 11,568,480,565 cycles # 2.825 GHz - 30,592,470,506 instructions # 2.64 insn per cycle - 4.096724147 seconds time elapsed +TOTAL : 4.048666 sec + 11,575,664,145 cycles # 2.856 GHz + 30,553,850,100 instructions # 2.64 insn per cycle + 4.054084967 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778081822 -Relative difference = 1.9061021324348284e-07 +Avg ME (F77/C++) = 4.3134711065803470 +Relative difference = 2.0712309084777445e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.442158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.748594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.748594e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.664036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.996133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996133e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.473093 sec - 6,663,246,815 cycles # 2.689 GHz - 13,582,195,938 instructions # 2.04 insn per cycle - 2.478977008 seconds time elapsed +TOTAL : 2.356178 sec + 6,658,302,794 cycles # 2.820 GHz + 13,562,921,509 instructions # 2.04 insn per cycle + 2.361522326 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events 
across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.658370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.993226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.993226e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.881697e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.239443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.239443e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.362618 sec - 6,353,039,315 cycles # 2.684 GHz - 13,072,016,547 instructions # 2.06 insn per cycle - 2.368607155 seconds time elapsed +TOTAL : 2.256973 sec + 6,358,268,451 cycles # 2.812 GHz + 13,051,032,250 instructions # 2.05 insn per cycle + 2.262443799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2867) (512y: 130) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.116355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262209e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.262209e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.315545e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.475298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.475298e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) 
GeV^0 -TOTAL : 3.476875 sec - 6,216,987,973 cycles # 1.786 GHz - 8,426,779,606 instructions # 1.36 insn per cycle - 3.483074770 seconds time elapsed +TOTAL : 3.270412 sec + 6,229,631,837 cycles # 1.902 GHz + 8,410,109,037 instructions # 1.35 insn per cycle + 3.275946145 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1598) (512y: 96) (512z: 1978) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index f78a78f7e9..4fc1a137cc 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:06:21 +DATE: 2025-12-07_19:52:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.581022e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.292223e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.567393e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.477961e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.263880e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.568884e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.541711 sec - 2,303,336,148 cycles # 2.840 GHz - 3,222,227,466 instructions # 1.40 insn per cycle - 0.868265701 seconds time elapsed +TOTAL : 0.536640 sec + 2,359,119,302 cycles # 2.927 GHz + 3,316,781,338 instructions # 1.41 insn per cycle + 0.863698088 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.563907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.598575e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.598575e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.618561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.654433e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654433e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.817167 sec - 19,709,237,083 cycles # 2.890 GHz - 50,290,409,188 instructions # 2.55 insn per cycle - 6.822753554 seconds time elapsed +TOTAL : 6.587409 sec + 19,716,568,347 cycles # 2.991 GHz + 50,237,612,561 instructions # 2.55 insn per cycle + 6.592953617 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711782756741 -Relative difference = 1.9050183377028104e-07 
+Avg ME (F77/C++) = 4.3134711542529578 +Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.841525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.969254e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969254e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.903565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.036657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.036657e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.802477 sec - 11,003,460,648 cycles # 2.890 GHz - 29,103,019,269 instructions # 2.64 insn per cycle - 3.808301655 seconds time elapsed +TOTAL : 3.721956 sec + 11,008,672,174 cycles # 2.954 GHz + 29,065,058,212 instructions # 2.64 insn per cycle + 3.727307630 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2766) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778081822 -Relative difference = 1.9061021324348284e-07 +Avg ME (F77/C++) = 4.3134711065803470 +Relative difference = 2.0712309084777445e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.769392e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.987989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.987989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.850332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.074452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.074452e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.893528 sec - 7,880,875,441 cycles # 2.719 GHz - 15,079,012,118 instructions # 1.91 insn per cycle - 2.899352011 seconds time elapsed +TOTAL : 2.832478 sec + 7,882,068,237 cycles # 2.779 GHz + 15,060,348,175 instructions # 1.91 insn per cycle + 2.837975842 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3163) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.967773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.208568e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.208568e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.112935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.361031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.361031e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.753936 sec - 7,508,856,368 cycles # 2.722 GHz - 14,417,603,283 instructions # 1.92 insn per cycle - 2.759752652 seconds time elapsed +TOTAL : 2.657722 sec + 7,489,720,611 cycles # 2.813 GHz + 14,398,703,635 instructions # 1.92 insn per cycle + 2.663057286 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2737) (512y: 304) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.068489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209462e+05 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 3.277401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.433326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.433326e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.528645 sec - 6,308,539,404 cycles # 1.786 GHz - 9,645,872,961 instructions # 1.53 insn per cycle - 3.534370742 seconds time elapsed +TOTAL : 3.307884 sec + 6,293,956,505 cycles # 1.900 GHz + 9,629,504,446 instructions # 1.53 insn per cycle + 3.313281770 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1371) (512y: 204) (512z: 2172) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index b64bd08c6e..2b31eb34c9 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:02:19 +DATE: 2025-12-07_19:48:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.749715e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.123100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.745217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.107488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.169996e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.460632 sec - 2,016,310,298 cycles # 2.828 GHz - 2,811,062,777 instructions # 1.39 insn per cycle - 0.771405460 seconds time elapsed +TOTAL : 0.454875 sec + 2,008,963,582 cycles # 2.820 GHz + 2,813,564,367 instructions # 1.40 insn per cycle + 0.769244257 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.798297e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.902790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.910598e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.740729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.848086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855119e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483683 sec - 2,080,405,450 cycles # 2.828 GHz - 2,919,633,235 instructions # 1.40 insn per cycle - 0.795243442 seconds time elapsed +TOTAL : 0.478798 sec + 2,126,527,395 cycles # 2.922 GHz + 3,038,567,698 instructions # 1.43 insn per cycle + 0.788185065 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.386932e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.390193e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.390193e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.528110e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.531659e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.531659e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.158198 sec - 459,847,306 cycles # 2.852 GHz - 1,381,276,044 instructions # 3.00 insn per cycle - 0.161817794 seconds time elapsed +TOTAL : 0.152082 sec + 460,051,911 cycles # 2.970 GHz + 1,380,028,247 instructions # 3.00 insn per cycle + 0.155502468 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.255945e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.267065e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.267065e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.574355e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.586648e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.586648e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.086223 sec - 240,474,211 cycles # 2.695 GHz - 691,658,857 instructions # 2.88 insn per cycle - 0.089852973 seconds time elapsed +TOTAL : 0.082367 sec + 239,921,551 cycles # 2.815 GHz + 691,615,487 instructions # 2.88 insn per cycle + 0.085888893 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.385213e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.390914e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.390914e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.438821e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.444576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.444576e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.040134 sec - 114,132,005 cycles # 2.644 GHz - 258,038,380 instructions # 2.26 insn per cycle - 0.043763583 seconds time elapsed +TOTAL : 0.038722 sec + 113,989,450 cycles # 2.732 GHz + 257,906,777 instructions # 2.26 insn per cycle + 0.042222071 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538966e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546528e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546528e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.592348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.599320e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.599320e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.036228 sec - 103,692,755 cycles # 2.641 GHz - 240,622,200 instructions # 2.32 insn per cycle - 0.039728552 seconds time elapsed +TOTAL : 0.035118 sec + 103,307,522 cycles # 2.718 GHz + 240,607,279 instructions # 2.33 insn per cycle + 0.038599534 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8271) (512y: 130) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.148417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153199e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.153199e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.191911e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197556e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197556e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.048211 sec - 90,387,142 cycles # 1.755 GHz - 134,612,621 instructions # 1.49 insn per cycle - 0.052002771 seconds time elapsed +TOTAL : 0.046367 sec + 89,922,162 cycles # 1.827 GHz + 134,561,841 instructions # 1.50 insn per cycle + 0.049964580 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 104) (512z: 7074) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index 4db43dd255..4b75720a5d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:02:42 +DATE: 2025-12-07_19:48:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.803202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.245341e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.763853e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.128846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192306e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.458543 sec - 2,011,139,566 cycles # 2.825 GHz - 2,801,263,226 instructions # 1.39 insn per cycle - 0.769027350 seconds time elapsed +TOTAL : 0.454346 sec + 2,062,485,118 cycles # 2.911 GHz + 2,854,325,849 instructions # 1.38 insn per cycle + 0.765933681 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.788680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.895418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.902637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.837853e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.948940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.956520e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483711 sec - 2,072,169,922 cycles # 2.815 GHz - 2,948,772,929 instructions # 1.42 insn per cycle - 0.795276590 seconds time elapsed +TOTAL : 0.476970 sec + 2,093,139,828 cycles # 2.883 GHz + 2,962,990,828 instructions # 1.42 insn per cycle + 0.786280898 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. @@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.383885e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387148e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387148e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.554340e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557910e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557910e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157412 sec - 457,302,712 cycles # 2.851 GHz - 1,376,801,855 instructions # 3.01 insn per cycle - 0.160964317 seconds time elapsed +TOTAL : 0.150100 sec + 457,467,439 cycles # 2.988 GHz + 1,375,635,784 instructions # 3.01 insn per cycle + 0.153610734 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.288759e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.301116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.301116e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.532609e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.545952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.545952e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.085024 sec - 238,495,422 cycles # 2.707 GHz - 687,028,266 instructions # 2.88 insn per cycle - 0.088746242 seconds time elapsed +TOTAL : 0.082118 sec + 237,969,637 cycles # 2.802 GHz + 687,046,106 
instructions # 2.89 insn per cycle + 0.085530074 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9384) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.395926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.401596e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.401596e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.413679e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.419255e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419255e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039010 sec - 112,073,428 cycles # 2.662 GHz - 253,139,110 instructions # 2.26 insn per cycle - 0.042677736 seconds time elapsed +TOTAL : 0.038562 sec + 111,942,713 cycles # 2.696 GHz + 253,193,714 instructions # 2.26 insn per cycle + 0.042037397 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8538) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.525855e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.532589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.532589e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.636950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645411e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.035869 sec - 101,601,884 cycles # 2.611 GHz - 235,894,497 instructions # 2.32 insn per cycle - 0.039518260 seconds time elapsed +TOTAL : 0.033405 sec + 101,008,411 cycles # 2.779 GHz + 235,870,311 instructions # 2.34 insn per cycle + 0.036913146 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8224) (512y: 130) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.142399e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.147704e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.147704e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199566e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204237e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204237e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047633 sec - 88,136,356 cycles # 1.737 GHz - 129,828,247 
instructions # 1.47 insn per cycle - 0.051419113 seconds time elapsed +TOTAL : 0.045548 sec + 88,239,313 cycles # 1.813 GHz + 129,874,471 instructions # 1.47 insn per cycle + 0.049263930 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 104) (512z: 7074) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 5211bad1d2..d3f254d755 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:03:51 +DATE: 2025-12-07_19:49:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.302427e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.704300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.791284e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.368068e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.766941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.851499e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.462607 sec - 2,015,593,801 cycles # 2.836 GHz - 2,784,970,796 instructions # 1.38 insn per cycle - 0.770212174 seconds time elapsed +TOTAL : 0.457335 sec + 2,007,929,467 cycles # 2.846 GHz + 2,825,971,709 instructions # 1.41 insn per cycle + 0.761978303 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.169898e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187942e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190235e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174495e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192560e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194318e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020495e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.469557 sec - 2,042,790,873 cycles # 2.836 GHz - 2,884,156,824 instructions # 1.41 insn per cycle - 0.777382571 seconds time elapsed +TOTAL : 0.465604 sec + 2,064,451,568 cycles # 2.898 GHz + 2,880,340,253 instructions # 1.40 insn per cycle + 0.769530883 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.579211e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582825e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582825e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.591125e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594751e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594751e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.149618 sec - 441,460,345 cycles # 2.891 GHz - 1,357,431,891 instructions # 3.07 insn per cycle - 0.153196109 seconds time elapsed +TOTAL : 0.149162 sec + 441,878,747 cycles # 2.903 GHz + 1,358,735,965 instructions # 3.07 insn per cycle + 0.152657350 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,9 +113,9 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105256181649E-006 -Relative difference = 5.836526409016727e-08 +Avg ME (C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278104929984789E-006 +Relative difference = 6.06557583856253e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.178631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183684e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183684e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.228946e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.233190e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233190e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.046713 sec - 133,037,126 cycles # 2.662 GHz - 371,430,035 instructions # 2.79 insn per cycle - 0.050453436 seconds time elapsed +TOTAL : 0.044868 sec + 132,322,826 cycles # 2.762 GHz + 371,490,364 instructions # 2.81 insn per cycle + 0.048437517 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9988) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.599910e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.621223e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.621223e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.747998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.770201e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.770201e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022499 sec - 65,701,477 cycles # 2.576 GHz - 142,904,938 instructions # 2.18 insn per cycle - 0.026069649 seconds time elapsed +TOTAL : 0.021194 sec + 65,091,402 cycles # 2.689 GHz + 142,974,817 instructions # 2.20 insn per cycle + 0.024760130 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.684576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.708888e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.708888e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.976637e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.002208e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.002208e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021728 sec - 60,421,247 cycles # 2.428 GHz - 133,158,601 instructions # 2.20 insn per cycle - 0.025465207 seconds time elapsed +TOTAL : 0.019607 sec + 59,954,054 cycles # 2.661 GHz + 133,208,364 instructions # 2.22 insn per cycle + 0.023118866 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9093) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.239020e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260813e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.326940e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.347007e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.347007e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.025827 sec - 52,150,255 cycles # 1.790 GHz - 79,743,681 instructions # 1.53 insn per cycle - 0.029792364 seconds time elapsed +TOTAL : 0.024798 sec + 51,570,928 cycles # 1.856 GHz + 79,695,428 instructions # 1.55 insn per cycle + 0.028362794 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3031) (512y: 8) (512z: 7424) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index c79acb423d..4da6fe3afd 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:04:20 +DATE: 2025-12-07_19:50:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.351614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.802263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.888038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.198225e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.565705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.748849e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.458224 sec - 1,995,767,929 cycles # 2.816 GHz - 2,740,980,318 instructions # 1.37 insn per cycle - 0.766478985 seconds time elapsed +TOTAL : 0.459416 sec + 2,040,155,605 cycles # 2.845 GHz + 2,862,791,638 instructions # 1.40 insn per cycle + 0.778997837 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.181811e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198606e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.183282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201378e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203023e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.469407 sec - 2,020,295,671 cycles # 2.810 GHz - 2,851,658,754 instructions # 1.41 insn per cycle - 0.776046944 seconds time elapsed +TOTAL : 0.469153 sec + 2,032,696,185 cycles # 2.837 GHz + 2,836,419,968 instructions # 1.40 insn per cycle + 0.774320922 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.511421e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515116e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.151755 sec - 446,437,299 cycles # 2.884 GHz - 1,359,153,558 instructions # 3.04 insn per cycle - 0.155354916 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.605021e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.608890e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.608890e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.147895 sec + 447,052,123 cycles # 2.963 GHz + 1,360,492,138 instructions # 3.04 insn per cycle + 0.151284972 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105326147384E-006 -Relative difference = 5.7504445173550794e-08 +Avg ME (F77/C++) = 8.1278105042024615E-006 +Relative difference = 6.100013138863422e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.180553e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.185062e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.185062e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.195275e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200179e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200179e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045862 sec - 130,422,574 cycles # 2.664 GHz - 366,713,009 instructions # 2.81 insn per cycle - 0.049604747 seconds time elapsed +TOTAL : 0.045248 sec + 129,685,319 cycles # 2.696 GHz + 366,757,162 instructions # 2.83 insn per cycle + 0.048597637 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9971) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP 
precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.692821e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.714744e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.714744e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.748766e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.771336e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.771336e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020805 sec - 63,132,535 cycles # 2.647 GHz - 138,133,867 instructions # 2.19 insn per cycle - 0.024434416 seconds time elapsed +TOTAL : 0.020464 sec + 62,901,069 cycles # 2.676 GHz + 138,154,883 instructions # 2.20 insn per cycle + 0.023999005 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.972359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.000309e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.000309e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.043138e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.069486e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069486e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019005 sec - 58,481,038 cycles # 2.633 GHz - 128,386,986 instructions # 2.20 insn per cycle - 0.022679122 seconds time elapsed +TOTAL : 0.018509 sec + 57,468,378 cycles # 2.682 GHz + 128,434,347 instructions # 2.23 insn per cycle + 0.022020119 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9045) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.272413e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292411e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292411e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339404e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.357780e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.357780e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024623 sec - 50,322,119 cycles # 1.806 GHz - 74,992,557 instructions # 1.49 insn per cycle - 0.028526790 seconds time elapsed +TOTAL : 0.023862 sec + 49,483,246 cycles # 1.846 GHz + 74,918,820 instructions # 1.51 insn per cycle + 0.027411583 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2983) (512y: 8) (512z: 7425) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index c43ff17d3c..ba82b66e9f 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:03:05 +DATE: 2025-12-07_19:49:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.763173e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125938e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.192941e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.685280e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.156158e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.458247 sec - 2,022,321,141 cycles # 2.816 GHz - 2,799,483,258 instructions # 1.38 insn per cycle - 0.774798224 seconds time elapsed +TOTAL : 0.457069 sec + 2,059,059,271 cycles # 2.916 GHz + 2,862,194,841 instructions # 1.39 insn per cycle + 0.765173284 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.866016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.873910e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.783486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.892021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.898730e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.484676 sec - 2,078,557,296 cycles # 2.829 GHz - 2,897,976,393 instructions # 1.39 insn per cycle - 0.794258904 seconds time elapsed +TOTAL : 0.479293 sec + 2,108,094,886 cycles # 2.895 GHz + 3,011,664,194 instructions # 1.43 insn per cycle + 0.789427267 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.388630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.392004e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.392004e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.536046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539412e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539412e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157940 sec - 464,903,592 cycles # 2.886 GHz - 1,389,803,957 instructions # 2.99 insn per cycle - 0.161593391 seconds time elapsed +TOTAL : 0.151585 sec + 461,290,442 cycles # 2.988 GHz + 1,385,203,963 instructions # 3.00 insn per cycle + 0.155071029 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562948736117E-006 -Relative difference = 3.32837900190667e-07 +Avg ME (F77/C++) = 8.1274563899879256E-006 +Relative difference = 3.2113506491343336e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.572359e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.584503e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.584503e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.812355e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.826432e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.826432e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.082287 sec - 236,914,725 cycles # 2.777 GHz - 687,861,027 instructions # 2.90 insn per cycle - 0.085920826 seconds time elapsed +TOTAL : 0.079414 sec + 236,109,523 cycles # 2.867 GHz + 687,290,713 instructions # 2.91 insn per cycle + 0.082872327 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563175290919E-006 -Relative difference = 3.3005037703909805e-07 +Avg ME (F77/C++) = 8.1274564132406470E-006 +Relative difference = 3.1827405738783765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419898e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425632e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425632e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.457942e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.463642e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463642e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039368 sec - 113,570,815 cycles # 2.680 GHz - 253,055,756 instructions # 2.23 insn per cycle - 0.042992839 seconds time elapsed +TOTAL : 0.038169 sec + 112,677,666 cycles # 2.744 GHz + 252,756,567 instructions # 2.24 insn per cycle + 0.041677201 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.595281e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.602693e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.602693e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.621535e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.628853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628853e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 
) GeV^-4 -TOTAL : 0.035105 sec - 102,173,670 cycles # 2.666 GHz - 233,820,968 instructions # 2.29 insn per cycle - 0.038810282 seconds time elapsed +TOTAL : 0.034652 sec + 101,371,352 cycles # 2.701 GHz + 233,448,147 instructions # 2.30 insn per cycle + 0.038075913 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7314) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.158210e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163544e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.163544e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.211323e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.216366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216366e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047815 sec - 89,915,156 cycles # 1.766 GHz - 131,317,903 instructions # 1.46 insn per cycle - 0.051535880 seconds time elapsed +TOTAL : 0.045913 sec + 89,949,476 cycles # 1.831 GHz + 131,112,475 instructions # 1.46 insn per cycle + 0.049677353 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1995) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt 
b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index d6a9bd8585..78839d5595 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:03:28 +DATE: 2025-12-07_19:49:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.669359e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.024328e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.088471e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.685285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.028169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.086180e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.459467 sec - 2,006,632,193 cycles # 2.818 GHz - 2,802,302,686 instructions # 1.40 insn per cycle - 0.769563513 seconds time elapsed +TOTAL : 0.457729 sec + 2,046,709,495 cycles # 2.882 GHz + 2,878,668,383 instructions # 1.41 insn per cycle + 0.767439326 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.797271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897088e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.904896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.794480e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.905511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.912659e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485964 sec - 2,085,949,128 cycles # 2.828 GHz - 2,970,232,534 instructions # 1.42 insn per cycle - 0.796151358 seconds time elapsed +TOTAL : 0.479165 sec + 2,122,695,273 cycles # 2.913 GHz + 3,025,936,720 instructions # 1.43 insn per cycle + 0.788545450 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.393388e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396682e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396682e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.465650e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.472701e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.472701e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156959 sec - 461,726,786 cycles # 2.887 GHz - 1,385,347,614 instructions # 3.00 insn per cycle - 0.160462326 seconds time elapsed +TOTAL : 0.153841 sec + 459,118,791 cycles # 2.932 GHz + 1,380,772,944 instructions # 3.01 insn per cycle + 0.157281464 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562948736117E-006 -Relative difference = 3.32837900190667e-07 +Avg ME (F77/C++) = 8.1274563899879256E-006 +Relative difference = 3.2113506491343336e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.599813e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612219e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612219e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.789044e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.801525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.801525e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.081200 sec - 234,522,151 cycles # 2.781 GHz - 683,124,885 instructions # 2.91 insn per cycle - 0.084930246 seconds time elapsed +TOTAL : 0.078980 sec + 233,971,329 cycles # 2.851 GHz + 682,635,662 instructions # 2.92 insn per cycle + 0.082568192 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563175290919E-006 -Relative difference = 3.3005037703909805e-07 +Avg ME (F77/C++) = 8.1274564132406470E-006 +Relative difference = 3.1827405738783765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.420930e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.426598e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.426598e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.447075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.452932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.452932e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038386 sec - 111,202,178 cycles # 2.675 GHz - 248,277,259 instructions # 2.23 insn per cycle - 0.042154353 seconds time elapsed +TOTAL : 0.037741 sec + 110,759,212 cycles # 2.725 GHz + 247,971,600 instructions # 2.24 insn per cycle + 0.041276309 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.570276e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.578064e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.578064e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633232e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.640627e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640627e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 
) GeV^-4 -TOTAL : 0.034958 sec - 100,134,440 cycles # 2.632 GHz - 229,125,035 instructions # 2.29 insn per cycle - 0.038647286 seconds time elapsed +TOTAL : 0.033510 sec + 98,924,853 cycles # 2.714 GHz + 228,784,823 instructions # 2.31 insn per cycle + 0.037054412 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7265) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.164156e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.168925e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.238196e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.244133e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244133e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046899 sec - 87,248,248 cycles # 1.750 GHz - 126,582,829 instructions # 1.45 insn per cycle - 0.050568011 seconds time elapsed +TOTAL : 0.044061 sec + 87,118,484 cycles # 1.851 GHz + 126,325,437 instructions # 1.45 insn per cycle + 0.047721484 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 0619b08e27..e1df6964a4 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:00:50 +DATE: 2025-12-07_19:46:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.353699e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.078498e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.922999e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.103839e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.009770e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879679e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.530539 sec - 2,259,281,332 cycles # 2.839 GHz - 3,100,637,501 instructions # 1.37 insn per cycle - 0.855479528 seconds time elapsed +TOTAL : 0.525909 sec + 2,302,094,375 cycles # 2.916 GHz + 3,171,107,290 instructions # 1.38 insn per cycle + 0.848071813 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.156775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.205296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.205296e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.544566e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.650602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.650602e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.400705 sec - 4,031,222,897 cycles # 2.869 GHz - 9,715,380,409 instructions # 2.41 insn per cycle - 1.406286157 seconds time elapsed +TOTAL : 1.338131 sec + 4,008,874,604 cycles # 2.985 GHz + 9,713,422,817 instructions # 2.42 insn per cycle + 1.343886491 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.450099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.861491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.861491e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.524467e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.962458e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.962458e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.838337 sec - 2,350,240,123 cycles # 2.786 GHz - 5,962,397,870 instructions # 2.54 insn per cycle - 0.844193677 seconds time elapsed +TOTAL : 0.799243 sec + 
2,348,380,960 cycles # 2.920 GHz + 5,962,486,632 instructions # 2.54 insn per cycle + 0.804968828 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.162719e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.161528e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.161528e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278741e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.337558e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.337558e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.600854 sec - 1,671,713,001 cycles # 2.758 GHz - 3,319,973,297 instructions # 1.99 insn per cycle - 0.606663801 seconds time elapsed +TOTAL : 0.571530 sec + 1,663,247,245 cycles # 2.886 GHz + 3,320,387,726 instructions # 2.00 insn per cycle + 0.576968895 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1492) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.261662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.349890e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349890e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328633e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.431086e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431086e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.577948 sec - 1,617,041,581 cycles # 2.773 GHz - 3,291,143,565 instructions # 2.04 insn per cycle - 0.583833732 seconds time elapsed +TOTAL : 0.561841 sec + 1,623,155,360 cycles # 2.864 GHz + 3,291,449,761 instructions # 2.03 insn per cycle + 0.567370900 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1367) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.100149e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.993172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.993172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.208742e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.166144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.166144e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.615039 
sec - 1,364,172,223 cycles # 2.200 GHz - 2,429,556,714 instructions # 1.78 insn per cycle - 0.620861975 seconds time elapsed +TOTAL : 0.586473 sec + 1,362,786,393 cycles # 2.305 GHz + 2,429,655,520 instructions # 1.78 insn per cycle + 0.591915895 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 583) (512y: 60) (512z: 1009) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 071e7697d0..ed1c04005d 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:05 +DATE: 2025-12-07_19:47:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.417263e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.094810e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.959655e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.190845e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980293e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.856491e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.525108 sec - 2,234,624,938 cycles # 2.820 GHz - 3,124,481,460 instructions # 1.40 insn per cycle - 0.850037014 seconds time elapsed +TOTAL : 0.523584 sec + 2,298,144,522 cycles # 2.916 GHz + 3,190,060,913 instructions # 1.39 insn per cycle + 0.846016631 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.289834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.373214e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.373214e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.610904e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.743927e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.743927e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.378734 sec - 3,995,674,296 cycles # 2.888 GHz - 9,595,338,306 instructions # 2.40 insn per cycle - 1.384441945 seconds time elapsed +TOTAL : 1.329212 sec + 3,995,190,163 cycles # 2.995 GHz + 9,595,722,950 instructions # 2.40 insn per cycle + 1.334961663 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.457938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.874008e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.478658e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.887776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.887776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.834586 sec - 2,348,281,075 cycles # 2.796 GHz - 5,903,694,010 instructions # 2.51 insn per cycle - 0.840556806 seconds time elapsed +TOTAL : 0.821267 sec + 
2,354,024,065 cycles # 2.850 GHz + 5,904,015,036 instructions # 2.51 insn per cycle + 0.826852524 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.178686e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.194593e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.255647e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.310954e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310954e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.595816 sec - 1,665,750,464 cycles # 2.772 GHz - 3,289,499,758 instructions # 1.97 insn per cycle - 0.601728408 seconds time elapsed +TOTAL : 0.576119 sec + 1,667,061,327 cycles # 2.870 GHz + 3,289,696,900 instructions # 1.97 insn per cycle + 0.581664029 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1437) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.335615e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.335615e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278676e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367476e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.579487 sec - 1,624,326,903 cycles # 2.777 GHz - 3,265,891,511 instructions # 2.01 insn per cycle - 0.585419257 seconds time elapsed +TOTAL : 0.570789 sec + 1,622,878,680 cycles # 2.819 GHz + 3,265,983,749 instructions # 2.01 insn per cycle + 0.576514933 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1330) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.069886e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.953317e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953317e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.080746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.958062e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958062e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.621553 
sec - 1,373,190,892 cycles # 2.193 GHz - 2,413,828,053 instructions # 1.76 insn per cycle - 0.627336488 seconds time elapsed +TOTAL : 0.621909 sec + 1,368,128,009 cycles # 2.189 GHz + 2,413,830,699 instructions # 1.76 insn per cycle + 0.628646929 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 550) (512y: 60) (512z: 1005) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 6216dff6c8..0d727faa2a 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:47 +DATE: 2025-12-07_19:47:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.174946e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.068173e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.272719e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.124140e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.942066e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.189451e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.489126 sec - 2,124,007,963 cycles # 2.815 GHz - 2,945,321,471 instructions # 1.39 insn per cycle - 0.811539193 seconds time elapsed +TOTAL : 0.490381 sec + 2,199,445,619 cycles # 2.908 GHz + 3,018,337,846 instructions # 1.37 insn per cycle + 0.815630142 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.779077e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.006315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.006315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.132129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045703e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045703e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.286813 sec - 3,697,266,650 cycles # 2.863 GHz - 9,611,683,530 instructions # 2.60 insn per cycle - 1.292373810 seconds time elapsed +TOTAL : 1.237096 sec + 3,699,455,560 cycles # 2.981 GHz + 9,611,506,441 instructions # 2.60 insn per cycle + 1.242342841 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.204438e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350250e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350250e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.261244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.406921e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.406921e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.567715 sec - 1,640,656,743 cycles # 2.864 GHz - 3,979,080,194 instructions # 2.43 insn per cycle - 0.573454265 seconds time elapsed +TOTAL : 0.551485 sec + 
1,637,865,132 cycles # 2.945 GHz + 3,979,567,344 instructions # 2.43 insn per cycle + 0.556726338 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1553) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.953501e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.188885e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.188885e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061175e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.431033e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.431033e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.446090 sec - 1,257,376,904 cycles # 2.787 GHz - 2,504,409,181 instructions # 1.99 insn per cycle - 0.451851006 seconds time elapsed +TOTAL : 0.430113 sec + 1,257,134,512 cycles # 2.893 GHz + 2,504,344,250 instructions # 1.99 insn per cycle + 0.435268701 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1915) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.026066e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.404220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.404220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.174394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.682411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.682411e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438014 sec - 1,235,323,979 cycles # 2.788 GHz - 2,479,535,477 instructions # 2.01 insn per cycle - 0.443692621 seconds time elapsed +TOTAL : 0.417709 sec + 1,228,070,078 cycles # 2.907 GHz + 2,479,961,178 instructions # 2.02 insn per cycle + 0.422978295 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1861) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.854396e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.809242e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.809242e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.966772e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.032653e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.032653e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.460001 
sec - 1,078,883,681 cycles # 2.321 GHz - 2,076,270,716 instructions # 1.92 insn per cycle - 0.465628674 seconds time elapsed +TOTAL : 0.443129 sec + 1,080,880,082 cycles # 2.414 GHz + 2,076,451,681 instructions # 1.92 insn per cycle + 0.448409141 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1014) (512y: 5) (512z: 1276) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index b9e5df5750..2c81bdd186 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:02:06 +DATE: 2025-12-07_19:48:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.174766e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.032980e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.224739e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.136734e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.033265e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.238271e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.489051 sec - 2,148,781,052 cycles # 2.834 GHz - 2,942,650,451 instructions # 1.37 insn per cycle - 0.815858067 seconds time elapsed +TOTAL : 0.485763 sec + 2,172,654,366 cycles # 2.923 GHz + 2,981,441,098 instructions # 1.37 insn per cycle + 0.802100928 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.862221e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.017701e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.017701e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.239222e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.059869e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059869e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.273068 sec - 3,660,086,626 cycles # 2.864 GHz - 9,502,319,452 instructions # 2.60 insn per cycle - 1.278709233 seconds time elapsed +TOTAL : 1.222232 sec + 3,670,272,153 cycles # 2.992 GHz + 9,502,404,750 instructions # 2.59 insn per cycle + 1.227685794 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 370) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.092947e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.109735e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.109735e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.212133e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326573e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.591777 sec - 1,671,501,463 cycles # 2.802 GHz - 3,947,247,316 instructions # 2.36 insn per cycle - 0.597353565 seconds time elapsed +TOTAL : 0.561480 sec + 
1,654,306,122 cycles # 2.922 GHz + 3,947,430,485 instructions # 2.39 insn per cycle + 0.566880450 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1510) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.904335e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.013564e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.013564e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.087901e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.450467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.450467e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.451671 sec - 1,251,161,997 cycles # 2.741 GHz - 2,488,699,975 instructions # 1.99 insn per cycle - 0.457155054 seconds time elapsed +TOTAL : 0.425959 sec + 1,250,398,764 cycles # 2.904 GHz + 2,488,899,968 instructions # 1.99 insn per cycle + 0.431230738 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1819) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.993855e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.299058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.299058e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.177891e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.682250e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.682250e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.440947 sec - 1,225,739,794 cycles # 2.746 GHz - 2,464,639,586 instructions # 2.01 insn per cycle - 0.448602225 seconds time elapsed +TOTAL : 0.416707 sec + 1,224,598,764 cycles # 2.907 GHz + 2,464,135,945 instructions # 2.01 insn per cycle + 0.422026666 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1777) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.880064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.891083e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.891083e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020915e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.161468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.161468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.454521 
sec - 1,073,931,359 cycles # 2.337 GHz - 2,059,749,623 instructions # 1.92 insn per cycle - 0.460150581 seconds time elapsed +TOTAL : 0.434297 sec + 1,071,378,612 cycles # 2.441 GHz + 2,059,923,240 instructions # 1.92 insn per cycle + 0.439585784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 909) (512y: 5) (512z: 1267) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 5e30b14ca9..690a806a11 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:19 +DATE: 2025-12-07_19:47:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.446721e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.093075e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.939789e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.139729e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.002197e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.873734e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.525703 sec - 2,236,736,054 cycles # 2.823 GHz - 3,119,267,572 instructions # 1.39 insn per cycle - 0.849597854 seconds time elapsed +TOTAL : 0.523733 sec + 2,304,920,574 cycles # 2.921 GHz + 3,176,904,198 instructions # 1.38 insn per cycle + 0.846770687 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.117543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.151188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.151188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.334961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.411459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.411459e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.406267 sec - 4,043,925,432 cycles # 2.865 GHz - 9,738,556,635 instructions # 2.41 insn per cycle - 1.412149316 seconds time elapsed +TOTAL : 1.371675 sec + 4,066,270,765 cycles # 2.955 GHz + 9,734,658,906 instructions # 2.39 insn per cycle + 1.377563148 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.480932e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914447e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914447e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.523419e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.970680e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.824504 sec - 2,316,933,637 cycles # 2.792 GHz - 5,851,816,983 instructions # 2.53 insn per cycle - 0.830593669 seconds time elapsed +TOTAL : 0.800435 sec + 2,315,652,633 cycles # 2.875 GHz + 5,848,682,232 instructions # 2.53 insn per cycle + 0.806107909 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1366) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956707375011 +Relative difference = 2.2289696081807308e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.246053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.337007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.337007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.337550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500598e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500598e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.582389 sec - 1,613,472,858 cycles # 2.745 GHz - 3,206,778,468 instructions # 1.99 insn per cycle - 0.588460320 seconds time elapsed +TOTAL : 0.561301 sec + 1,623,114,698 cycles # 2.867 GHz + 3,203,805,266 instructions # 1.97 insn per cycle + 0.566923941 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1531) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) 
processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.322435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.481610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.481610e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.400067e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625477e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625477e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.567372 sec - 1,569,665,304 cycles # 2.742 GHz - 3,175,442,225 instructions # 2.02 insn per cycle - 0.573184846 seconds time elapsed +TOTAL : 0.548505 sec + 1,583,165,678 cycles # 2.861 GHz + 3,172,490,490 instructions # 2.00 insn per cycle + 0.554060805 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1435) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.075660e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.951397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.259094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.276173e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276173e+06 ) sec^-1 
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.621447 sec - 1,359,798,497 cycles # 2.170 GHz - 2,353,126,759 instructions # 1.73 insn per cycle - 0.627307566 seconds time elapsed +TOTAL : 0.578497 sec + 1,340,102,300 cycles # 2.297 GHz + 2,348,981,689 instructions # 1.75 insn per cycle + 0.584128436 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 738) (512y: 64) (512z: 1042) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 3f206f95bd..dba374195b 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:33 +DATE: 2025-12-07_19:47:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.462369e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.119008e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.948835e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.124398e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.028345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.905054e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.522593 sec - 2,229,764,062 cycles # 2.824 GHz - 3,122,707,099 instructions # 1.40 insn per cycle - 0.846718941 seconds time elapsed +TOTAL : 0.524030 sec + 2,294,869,845 cycles # 2.905 GHz + 3,124,281,704 instructions # 1.36 insn per cycle + 0.847261354 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.222292e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.282147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.282147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.453923e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.517612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.517612e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.390029 sec - 4,041,827,914 cycles # 2.897 GHz - 9,620,480,831 instructions # 2.38 insn per cycle - 1.395839351 seconds time elapsed +TOTAL : 1.351199 sec + 4,052,698,052 cycles # 2.989 GHz + 9,617,459,016 instructions # 2.37 insn per cycle + 1.356815416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative 
difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.484588e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916467e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.622046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.122426e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.122426e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.821088 sec - 2,277,892,232 cycles # 2.757 GHz - 5,806,859,822 instructions # 2.55 insn per cycle - 0.826926685 seconds time elapsed +TOTAL : 0.758621 sec + 2,283,316,079 cycles # 2.991 GHz + 5,803,459,515 instructions # 2.54 insn per cycle + 0.764241004 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956707375011 +Relative difference = 2.2289696081807308e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.285308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.418349e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.418349e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.356788e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517227e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517227e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.573049 sec - 1,611,028,972 cycles # 2.786 GHz - 3,186,162,266 instructions # 1.98 insn per cycle - 0.579129244 seconds time elapsed +TOTAL : 0.555767 sec + 1,613,406,813 cycles # 2.878 GHz + 3,183,077,724 instructions # 1.97 insn per cycle + 0.561360752 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1474) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.356503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544553e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544553e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.412001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611861e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.558398 sec - 1,559,160,941 cycles # 2.767 GHz - 3,150,562,622 instructions # 2.02 insn per cycle - 0.564070384 seconds time elapsed +TOTAL : 0.547024 sec + 1,562,861,700 cycles # 2.831 GHz + 3,147,728,426 instructions # 2.01 insn per cycle + 0.552530608 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1373) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.173215e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.148914e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.148914e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.283003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331433e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331433e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.596537 sec - 1,348,900,555 cycles # 2.242 GHz - 2,335,239,112 instructions # 1.73 insn per cycle - 0.602236132 seconds time elapsed +TOTAL : 0.570042 sec + 1,349,178,373 cycles # 2.347 GHz + 2,333,121,597 instructions # 1.73 insn per cycle + 0.575586193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 687) (512y: 64) (512z: 1030) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index e3ea0d9299..5a85972ae8 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:57:54 +DATE: 2025-12-07_19:43:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.706908e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.160258e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.561103e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695169e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.168936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571476e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.544889 sec - 2,278,331,746 cycles # 2.802 GHz - 3,194,429,442 instructions # 1.40 insn per cycle - 0.872956184 seconds time elapsed +TOTAL : 0.536307 sec + 2,334,975,820 cycles # 2.883 GHz + 3,250,882,834 instructions # 1.39 insn per cycle + 0.866674525 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.781718e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.827404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.827404e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.886937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.886937e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.994100 sec - 17,282,311,221 cycles # 2.881 GHz - 46,327,593,495 instructions # 2.68 insn per cycle - 5.999488168 seconds time elapsed +TOTAL : 5.805799 sec + 17,315,222,850 cycles # 2.980 GHz + 46,330,411,224 instructions # 2.68 insn per cycle + 5.811451379 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.117362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271065e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271065e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.229187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.389829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.389829e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.473625 sec - 10,058,480,748 cycles # 2.892 GHz - 27,928,334,913 instructions # 2.78 insn per cycle - 3.479625370 seconds time elapsed +TOTAL : 3.354438 sec + 10,038,365,443 cycles # 2.988 GHz + 27,926,383,719 instructions # 2.78 insn per cycle + 3.360110009 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2526) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.891803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.272223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.272223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.115948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.514806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.514806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.253673 sec - 6,113,479,898 cycles # 2.707 GHz - 12,619,681,498 instructions # 2.06 insn per cycle - 2.259543422 seconds time elapsed +TOTAL : 2.157045 sec + 6,079,245,979 cycles # 2.812 GHz + 12,619,360,891 
instructions # 2.08 insn per cycle + 2.162708416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.064851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.470121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.470121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.171510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.579335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.579335e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.179283 sec - 5,867,669,279 cycles # 2.687 GHz - 12,194,655,166 instructions # 2.08 insn per cycle - 2.184803472 seconds time elapsed +TOTAL : 2.134472 sec + 5,872,246,359 cycles # 2.745 GHz + 12,194,308,603 instructions # 2.08 insn per cycle + 2.140176765 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2417) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.394256e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.568035e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.568035e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.617303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.809520e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.809520e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.199079 sec - 5,758,256,477 cycles # 1.797 GHz - 8,312,435,809 instructions # 1.44 insn per cycle - 3.204885362 seconds time elapsed +TOTAL : 3.005476 sec + 5,738,126,083 cycles # 1.906 GHz + 8,311,838,655 instructions # 1.45 insn per cycle + 3.011143334 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 100) (512z: 1805) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 85796cb2e8..832826c428 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:58:23 +DATE: 2025-12-07_19:44:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.750318e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.090521e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471741e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.693214e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.059340e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.449286e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.536193 sec - 2,280,468,803 cycles # 2.831 GHz - 3,171,048,990 instructions # 1.39 insn per cycle - 0.862856350 seconds time elapsed +TOTAL : 0.532880 sec + 2,332,176,383 cycles # 2.922 GHz + 3,261,356,610 instructions # 1.40 insn per cycle + 0.855846908 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830968e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.879197e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.879197e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.900047e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949830e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949830e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.834979 sec - 16,842,100,019 cycles # 2.884 GHz - 45,296,854,647 instructions # 2.69 insn per cycle - 5.840673910 seconds time elapsed +TOTAL : 5.624567 sec + 16,858,951,225 cycles # 2.995 GHz + 45,295,432,089 instructions # 2.69 insn per cycle + 5.630073851 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.286582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.457425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.457425e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.389562e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566761e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.299071 sec - 9,574,991,301 cycles # 2.898 GHz - 26,751,055,486 instructions # 2.79 insn per cycle - 3.304842345 seconds time elapsed +TOTAL : 3.200910 sec + 9,589,422,384 cycles # 2.992 GHz + 26,750,550,559 instructions # 2.79 insn per cycle + 3.206300891 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.483668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.795787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.795787e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.667348e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.998446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998446e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.446633 sec - 6,630,126,092 cycles # 2.705 GHz - 14,155,939,252 instructions # 2.14 insn per cycle - 2.452232412 seconds time elapsed +TOTAL : 2.352583 sec + 6,638,006,315 cycles # 2.816 GHz + 14,155,497,948 
instructions # 2.13 insn per cycle + 2.358191591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.633646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.966509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.966509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.869553e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.224509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.224509e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.371147 sec - 6,420,781,885 cycles # 2.703 GHz - 13,756,522,591 instructions # 2.14 insn per cycle - 2.376767940 seconds time elapsed +TOTAL : 2.257825 sec + 6,396,924,362 cycles # 2.827 GHz + 13,754,813,192 instructions # 2.15 insn per cycle + 2.263475683 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2358) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.247851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404590e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.451036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624681e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.336819 sec - 5,939,444,089 cycles # 1.778 GHz - 10,130,416,003 instructions # 1.71 insn per cycle - 3.342426568 seconds time elapsed +TOTAL : 3.143993 sec + 5,947,741,194 cycles # 1.889 GHz + 10,129,599,687 instructions # 1.70 insn per cycle + 3.149707043 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1321) (512y: 208) (512z: 1987) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index e92931017f..45f87ccac3 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:59:57 +DATE: 2025-12-07_19:45:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.265470e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796248e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925275e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.988433e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783744e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924089e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.494715 sec - 2,133,928,532 cycles # 2.829 GHz - 2,961,237,291 instructions # 1.39 insn per cycle - 0.812186327 seconds time elapsed +TOTAL : 0.493091 sec + 2,172,173,356 cycles # 2.903 GHz + 3,027,636,183 instructions # 1.39 insn per cycle + 0.807470670 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 97 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.878391e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930853e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930853e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003596e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.670408 sec - 16,367,724,454 cycles # 2.885 GHz - 45,532,008,663 instructions # 2.78 insn per cycle - 5.675967017 seconds time elapsed +TOTAL : 5.466081 sec + 16,382,316,202 cycles # 2.995 GHz + 45,532,503,408 instructions # 2.78 insn per cycle + 5.471212634 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.407671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.731067e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731067e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.587728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.925902e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.925902e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.467869 sec - 7,095,747,201 cycles # 2.870 GHz - 17,858,347,842 instructions # 2.52 insn per cycle - 2.473312825 seconds time elapsed +TOTAL : 2.371100 sec + 7,096,054,794 cycles # 2.987 GHz + 17,857,928,594 instructions # 2.52 insn per cycle + 2.376274212 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3126) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.089358e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.160867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.160867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.444777e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.586575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.586575e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.384690 sec - 3,760,865,125 cycles # 2.707 GHz - 8,296,401,814 instructions # 2.21 insn per cycle - 1.390188663 seconds time elapsed +TOTAL : 1.326130 sec + 3,757,507,462 cycles # 2.824 GHz + 8,296,635,656 instructions # 
2.21 insn per cycle + 1.331317435 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.420631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.588852e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.668353e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.880442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.880442e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.334053 sec - 3,653,512,814 cycles # 2.729 GHz - 8,025,167,005 instructions # 2.20 insn per cycle - 1.339479555 seconds time elapsed +TOTAL : 1.296492 sec + 3,659,706,971 cycles # 2.813 GHz + 8,025,181,012 instructions # 2.19 insn per cycle + 1.301645481 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.300716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.921877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.921877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.501831e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.156873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156873e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.752788 sec - 3,290,640,509 cycles # 1.873 GHz - 6,097,403,848 instructions # 1.85 insn per cycle - 1.758187036 seconds time elapsed +TOTAL : 1.699911 sec + 3,265,684,686 cycles # 1.917 GHz + 6,094,773,044 instructions # 1.87 insn per cycle + 1.705336795 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2262) (512y: 0) (512z: 2152) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 890303a8f4..946f33a1dd 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_17:00:25 +DATE: 2025-12-07_19:46:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.221580e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.787567e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.918978e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.014623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.786688e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925274e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.494192 sec - 2,133,895,255 cycles # 2.826 GHz - 2,984,971,388 instructions # 1.40 insn per cycle - 0.812316425 seconds time elapsed +TOTAL : 0.488419 sec + 2,208,795,761 cycles # 2.913 GHz + 3,059,627,680 instructions # 1.39 insn per cycle + 0.815286723 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975706e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.992912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.545042 sec - 16,055,557,680 cycles # 2.893 GHz - 44,606,147,249 instructions # 2.78 insn per cycle - 5.550363279 seconds time elapsed +TOTAL : 5.345820 sec + 16,056,525,401 cycles # 3.001 GHz + 44,602,107,569 instructions # 2.78 insn per cycle + 5.351176321 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.166744e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.616602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.616602e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.372474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.846920e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.846920e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.117207 sec - 6,107,535,010 cycles # 2.878 GHz - 17,151,265,141 instructions # 2.81 insn per cycle - 2.122735579 seconds time elapsed +TOTAL : 2.036350 sec + 6,117,093,292 cycles # 2.997 GHz + 17,149,676,722 instructions # 2.80 insn per cycle + 2.041757490 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.890362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.440713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.440713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.185599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.772122e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.772122e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.868040 sec - 5,037,008,594 cycles # 2.691 GHz - 10,256,105,804 instructions # 2.04 insn per cycle - 1.873591030 seconds time elapsed +TOTAL : 1.779055 sec + 5,034,985,650 cycles # 2.823 GHz + 10,255,707,126 instructions 
# 2.04 insn per cycle + 1.784516958 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3910) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.987209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.558432e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.558432e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.252769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.852205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.852205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.838312 sec - 4,976,298,083 cycles # 2.700 GHz - 10,027,200,665 instructions # 2.01 insn per cycle - 1.843999254 seconds time elapsed +TOTAL : 1.761024 sec + 4,984,094,715 cycles # 2.823 GHz + 10,027,122,089 instructions # 2.01 insn per cycle + 1.766359503 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3807) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.543540e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.857388e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.857388e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.831200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.174658e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.174658e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 2.395195 sec - 4,386,171,031 cycles # 1.828 GHz - 8,457,161,359 instructions # 1.93 insn per cycle - 2.400661750 seconds time elapsed +TOTAL : 2.255672 sec + 4,386,263,631 cycles # 1.941 GHz + 8,456,924,819 instructions # 1.93 insn per cycle + 2.261055103 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2747) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 2e4f76055c..03b506896b 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppavx2
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512y
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512z
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2025-10-11_16:58:53
+DATE: 2025-12-07_19:44:54
HASBLAS=hasBlas
CUDACPP_RUNTIME_BLASCOLORSUM=
@@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su
Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.803206e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.197061e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.595248e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.697530e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.156410e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.559096e+07 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 0.542499 sec
- 2,291,067,565 cycles # 2.822 GHz
- 3,214,215,859 instructions # 1.40 insn per cycle
- 0.903410898 seconds time elapsed
+TOTAL : 0.536253 sec
+ 2,356,041,587 cycles # 2.928 GHz
+ 3,267,502,881 instructions # 1.39 insn per cycle
+ 0.863412693 seconds time elapsed
.........................................................................
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200
@@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.773351e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.818033e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.818033e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.832571e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.878583e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.878583e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 6.022953 sec
- 17,468,685,186 cycles # 2.898 GHz
- 46,428,017,151 instructions # 2.66 insn per cycle
- 6.028694923 seconds time elapsed
+TOTAL : 5.829216 sec
+ 17,514,318,889 cycles # 3.003 GHz
+ 46,410,871,848 instructions # 2.65 insn per cycle
+ 5.834871638 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
@@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359218686011
-Relative difference = 3.8758807327712803e-08
+Avg ME (F77/C++) = 2.0158359161343524
+Relative difference = 4.160340809458261e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
@@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.098858e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.251324e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.251324e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.222444e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.381226e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.381226e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 3.494063 sec
- 10,018,252,515 cycles # 2.863 GHz
- 27,545,325,597 instructions # 2.75 insn per cycle
- 3.499809973 seconds time elapsed
+TOTAL : 3.361736 sec
+ 10,051,325,016 cycles # 2.986 GHz
+ 27,532,416,492 instructions # 2.74 insn per cycle
+ 3.367454885 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
@@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359218686011
-Relative difference = 3.8758807327712803e-08
+Avg ME (F77/C++) = 2.0158359221973261
+Relative difference = 3.8595736002871474e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
@@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.882400e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.252051e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.252051e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.199636e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.611712e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.611712e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 2.257811 sec
- 5,988,198,927 cycles # 2.647 GHz
- 12,439,095,003 instructions # 2.08 insn per cycle
- 2.263664182 seconds time elapsed
+TOTAL : 2.122745 sec
+ 5,984,881,615 cycles # 2.813 GHz
+ 12,425,093,867 instructions # 2.08 insn per cycle
+ 2.128472371 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2756) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
@@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359178371690
-Relative difference = 4.0758688308634e-08
+Avg ME (F77/C++) = 2.0158359131019652
+Relative difference = 4.310769079923034e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
@@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.259591e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.697101e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.697101e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.398418e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.838367e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.838367e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 2.102985 sec
- 5,735,490,837 cycles # 2.721 GHz
- 12,004,650,662 instructions # 2.09 insn per cycle
- 2.108573871 seconds time elapsed
+TOTAL : 2.049910 sec
+ 5,718,799,048 cycles # 2.783 GHz
+ 11,992,088,713 instructions # 2.10 insn per cycle
+ 2.055397864 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2556) (512y: 126) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
@@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359178371690
-Relative difference = 4.0758688308634e-08
+Avg ME (F77/C++) = 2.0158359131019652
+Relative difference = 4.310769079923034e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
@@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.518029e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.702687e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.702687e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.689183e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.890266e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.890266e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 3.089670 sec
- 5,573,654,696 cycles # 1.801 GHz
- 7,983,962,804 instructions # 1.43 insn per cycle
- 3.095529304 seconds time elapsed
+TOTAL : 2.950384 sec
+ 5,579,773,514 cycles # 1.888 GHz
+ 7,973,844,625 instructions # 1.43 insn per cycle
+ 2.956194588 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1826)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
@@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359178371690
-Relative difference = 4.0758688308634e-08
+Avg ME (F77/C++) = 2.0158359131019652
+Relative difference = 4.310769079923034e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt
index 09594959d7..e11c8c6adb 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt
@@ -1,4 +1,4 @@
-MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_CUDA_ARCHITECTURE=70
MADGRAPH_HIP_ARCHITECTURE=
HASBLAS=hasBlas
@@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppavx2
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512y
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512z
make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2025-10-11_16:59:25
+DATE: 2025-12-07_19:45:22
HASBLAS=hasBlas
CUDACPP_RUNTIME_BLASCOLORSUM=
@@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su
Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.800950e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.127229e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.485215e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.700239e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.060058e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.452020e+07 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 0.537601 sec
- 2,294,644,932 cycles # 2.834 GHz
- 3,202,661,173 instructions # 1.40 insn per cycle
- 0.866738405 seconds time elapsed
+TOTAL : 0.533696 sec
+ 2,324,683,344 cycles # 2.913 GHz
+ 3,231,621,148 instructions # 1.39 insn per cycle
+ 0.855845935 seconds time elapsed
.........................................................................
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168
@@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.809865e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.856790e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.856790e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.878524e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.927200e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.927200e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 5.902916 sec
- 17,031,724,118 cycles # 2.883 GHz
- 45,397,065,381 instructions # 2.67 insn per cycle
- 5.908631173 seconds time elapsed
+TOTAL : 5.687762 sec
+ 17,084,373,938 cycles # 3.002 GHz
+ 45,380,156,717 instructions # 2.66 insn per cycle
+ 5.693485573 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
@@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359218686011
-Relative difference = 3.8758807327712803e-08
+Avg ME (F77/C++) = 2.0158359161343524
+Relative difference = 4.160340809458261e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
@@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.294098e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.465793e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.465793e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.389130e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.566641e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.566641e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 3.291976 sec
- 9,561,103,669 cycles # 2.900 GHz
- 26,144,822,297 instructions # 2.73 insn per cycle
- 3.297670541 seconds time elapsed
+TOTAL : 3.200115 sec
+ 9,593,301,738 cycles # 2.994 GHz
+ 26,131,303,903 instructions # 2.72 insn per cycle
+ 3.205832807 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 2347) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
@@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359218686011
-Relative difference = 3.8758807327712803e-08
+Avg ME (F77/C++) = 2.0158359221973261
+Relative difference = 3.8595736002871474e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
@@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.426643e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.734905e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.734905e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.595031e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.916041e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.916041e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 2.478214 sec
- 6,700,126,016 cycles # 2.700 GHz
- 13,943,282,534 instructions # 2.08 insn per cycle
- 2.483989370 seconds time elapsed
+TOTAL : 2.388134 sec
+ 6,729,617,513 cycles # 2.812 GHz
+ 13,930,484,059 instructions # 2.07 insn per cycle
+ 2.393824529 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
@@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359178371690
-Relative difference = 4.0758688308634e-08
+Avg ME (F77/C++) = 2.0158359131019652
+Relative difference = 4.310769079923034e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
@@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.620283e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.949819e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.949819e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.851976e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.199068e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.199068e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 2.378094 sec
- 6,404,718,099 cycles # 2.688 GHz
- 13,458,943,081 instructions # 2.10 insn per cycle
- 2.383779382 seconds time elapsed
+TOTAL : 2.265993 sec
+ 6,391,153,285 cycles # 2.815 GHz
+ 13,446,522,820 instructions # 2.10 insn per cycle
+ 2.271479066 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2508) (512y: 302) (512z: 0)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
@@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359178371690
-Relative difference = 4.0758688308634e-08
+Avg ME (F77/C++) = 2.0158359131019652
+Relative difference = 4.310769079923034e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
@@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.539955e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.726603e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.726603e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.692883e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.893052e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.893052e+05 ) sec^-1
MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0
-TOTAL : 3.070043 sec
- 5,557,581,294 cycles # 1.808 GHz
- 9,121,741,259 instructions # 1.64 insn per cycle
- 3.075761617 seconds time elapsed
+TOTAL : 2.944942 sec
+ 5,568,690,221 cycles # 1.888 GHz
+ 9,111,931,553 instructions # 1.64 insn per cycle
+ 2.950612470 seconds time elapsed
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2027)
-------------------------------------------------------------------------
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
@@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.015836e+00
-Avg ME (F77/C++) = 2.0158359178371690
-Relative difference = 4.0758688308634e-08
+Avg ME (F77/C++) = 2.0158359131019652
+Relative difference = 4.310769079923034e-08
OK (relative difference <= 5E-3)
=========================================================================
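Note on the "Relative difference" lines in the logs above: they compare the Fortran (F77) and C++ average matrix elements against a 5E-3 tolerance. The following is a minimal standalone sketch of arithmetic consistent with the numbers printed in these logs, not the actual cmpExe/madX.sh implementation; the variable names and the exact formula |F77-C++|/|C++| are assumptions used only for illustration.

// Hedged sketch: reproduce a "Relative difference" check like the ones reported above.
// NOTE: not the project's cmpExe implementation; formula and names are assumptions.
#include <cmath>
#include <cstdio>
int main()
{
  const double avgMEcpp = 2.015836e+00;       // "Avg ME (C++/C++)" quoted in the log
  const double avgMEf77 = 2.0158359161343524; // "Avg ME (F77/C++)" quoted in the log
  const double relDiff = std::fabs( avgMEf77 - avgMEcpp ) / std::fabs( avgMEcpp );
  const double tolerance = 5e-3;              // threshold quoted in the log
  std::printf( "Relative difference = %.15e\n", relDiff );
  std::printf( "%s (relative difference <= 5E-3)\n", relDiff <= tolerance ? "OK" : "ERROR" );
  return relDiff <= tolerance ? 0 : 1;
}

With the Avg ME values quoted above, this sketch yields a relative difference of order 4e-08, consistent with the logged value and far below the 5E-3 threshold.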